diff --git a/.gitattributes b/.gitattributes index 8c9edf9b3920f496bfd13070c9efa60592961d6d..9fd5cb3f4b31ed059a55aed8199713901de48e97 100644 --- a/.gitattributes +++ b/.gitattributes @@ -77,3 +77,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/Cython/Compiler/_ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.11 filter=lfs diff=lfs merge=lfs -text tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn.so.8 filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe new file mode 100644 index 0000000000000000000000000000000000000000..4c236ed0ab8253ceee9276ddc3cf5ed9a7ea6a4c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebc4c06b7d95e74e315419ee7e88e1d0f71e9e9477538c00a93a9ff8c66a6cfc +size 182784 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e594f596067b98072f4c85548d35b11d97e050aa Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/constant_folding.cpython-311.pyc differ diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py new file mode 100644 index 0000000000000000000000000000000000000000..6e5ec6ab82a30ae8b6b7bf88ff8dddc33d3e53a2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/codecache.py @@ -0,0 +1,2727 @@ +from __future__ import annotations + +import base64 +import copyreg +import dataclasses +import functools +import hashlib +import importlib +import io +import json +import logging +import multiprocessing +import os +import pathlib +import pickle +import pkgutil +import platform +import re +import shlex +import shutil +import signal +import subprocess +import sys +import sysconfig +import tempfile +import textwrap +import threading +import warnings +import weakref +from bisect import bisect_right +from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor +from copy import copy +from ctypes import c_void_p, cdll, CDLL +from functools import partial +from pathlib import Path +from threading import Thread +from time import sleep, time +from types import ModuleType +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING, Union + +import torch + +from torch._dynamo.device_interface import ( + get_interface_for_device, + get_registered_device_interfaces, +) +from torch._dynamo.utils import counters, dynamo_timed +from torch._inductor import config, exc, metrics +from torch._inductor.codegen.cuda import cuda_env +from torch._inductor.utils import cache_dir, developer_warning, is_linux +from torch._subclasses.fake_tensor import ( + extract_tensor_metadata, + FakeTensor, + TensorMetadata, +) +from torch.fx.experimental.symbolic_shapes import has_hint, hint_int, ShapeEnv + +if TYPE_CHECKING: + from torch._inductor.graph import GraphLowering + from torch._inductor.select_algorithm import ChoiceCaller + +from 
from torch.hub import _Faketqdm, tqdm

# Filesystem anchors derived from the location of this module.
_HERE = os.path.abspath(__file__)
_TORCH_PATH = os.path.dirname(os.path.dirname(_HERE))
_LINKER_SCRIPT = os.path.join(_TORCH_PATH, "_inductor/script.ld")

if config.is_fbcode():
    from triton.fb import build_paths
    from triton.fb.build import _run_build_command

    from torch._inductor.fb.utils import (
        log_global_cache_errors,
        log_global_cache_stats,
        log_global_cache_vals,
        use_global_cache,
    )
else:
    # Outside fbcode the global-cache hooks are no-ops and the global cache
    # is reported as disabled.

    def log_global_cache_errors(*args, **kwargs):
        pass

    def log_global_cache_stats(*args, **kwargs):
        pass

    def log_global_cache_vals(*args, **kwargs):
        pass

    def use_global_cache() -> bool:
        return False


# Timeout passed to every FileLock created in this module.
LOCK_TIMEOUT = 600

# Timing metrics for time spent in the compilation: the running total and the
# start timestamp of the interval currently being measured (None when idle).
_cumulative_compile_time = 0.0
_t0: Optional[float] = None


def _compile_start() -> None:
    """Start a timing interval unless one is already running."""
    global _t0
    if _t0 is not None:
        return
    _t0 = time()


def _compile_end() -> None:
    """Close the running interval, if any, and add it to the cumulative total."""
    global _cumulative_compile_time, _t0
    if _t0 is None:
        return
    elapsed = time() - _t0
    _cumulative_compile_time += elapsed
    _t0 = None


log = logging.getLogger(__name__)


def cpp_wrapper_cache_dir(name: str) -> str:
    """
    Create (if needed) and return the build directory for `name`.

    The directory lives under cache_dir() in a folder named after the running
    Python major/minor version and the CUDA version torch reports ("cpu" when
    torch.version.cuda is None).
    """
    cuda_version = torch.version.cuda
    if cuda_version is None:
        cu_str = "cpu"
    else:
        cu_str = f'cu{cuda_version.replace(".", "")}'
    python_version = f"py{sys.version_info.major}{sys.version_info.minor}"

    build_dir = os.path.join(cache_dir(), f"{python_version}_{cu_str}", name)
    os.makedirs(build_dir, exist_ok=True)
    return build_dir


def get_cpp_wrapper_cubin_path_name():
    """Return "cubin_path" when torch.version.hip is None, else "hsaco_path"."""
    if torch.version.hip is None:
        return "cubin_path"
    return "hsaco_path"
class CacheBase:
    """
    Shared plumbing for the JSON caches in this module.

    Cache files are keyed by a hash of the system description returned by
    get_system(), so entries recorded on one device/CUDA/triton combination
    are not reused on another.
    """

    @staticmethod
    @functools.lru_cache(None)
    def get_system() -> Dict[str, Any]:
        """
        Describe the current system and attach a sha256 "hash" of that
        description. The result is memoized for the life of the process.
        """
        try:
            import triton

            triton_version = triton.__version__
        except ModuleNotFoundError:
            triton_version = None

        try:
            system: Dict[str, Any] = {
                "device": {
                    "name": torch.cuda.get_device_properties(
                        torch.cuda.current_device()
                    ).name,
                },
                "version": {
                    "cuda": torch.version.cuda,
                    "triton": triton_version,
                },
            }
        except (AssertionError, RuntimeError):
            # If cuda is not installed, none of the above config is relevant.
            system = {}

        system["hash"] = hashlib.sha256(
            json.dumps(system, sort_keys=True).encode("utf-8")
        ).hexdigest()

        return system

    @staticmethod
    @functools.lru_cache(None)
    def get_local_cache_path() -> Path:
        """Return <cache_dir()>/cache/<system hash>, the local cache file."""
        return Path(os.path.join(cache_dir(), "cache", CacheBase.get_system()["hash"]))

    @staticmethod
    @functools.lru_cache(None)
    def get_global_cache_path() -> Optional[Path]:
        """
        Return <config.global_cache_dir>/<system hash>, or None when no
        global cache directory is configured.
        """
        return (
            Path(os.path.join(config.global_cache_dir, CacheBase.get_system()["hash"]))
            if config.global_cache_dir is not None
            else None
        )

    def __init__(self) -> None:
        # Without CUDA the instance is left uninitialized: `system`,
        # `local_cache_path` and `global_cache_path` are not set.
        if not torch.cuda.is_available():
            return

        self.system = CacheBase.get_system()

        self.local_cache_path = CacheBase.get_local_cache_path()
        self.global_cache_path = CacheBase.get_global_cache_path()

    def get_local_cache(self) -> Dict[str, Any]:
        """Load the "cache" section of the local cache file ({} if absent)."""
        if not self.local_cache_path.is_file():
            return {}
        with open(self.local_cache_path) as local_cache_fp:
            local_cache = json.load(local_cache_fp)
        return local_cache["cache"]

    def update_local_cache(self, local_cache: Dict[str, Any]) -> None:
        """Persist `local_cache` (with the system description) as JSON."""
        if not os.path.exists(self.local_cache_path.parent):
            os.makedirs(self.local_cache_path.parent, exist_ok=True)

        write_atomic(
            str(self.local_cache_path),
            json.dumps({"system": self.system, "cache": local_cache}, indent=4),
        )


class LocalCache(CacheBase):
    """Nested key/value view over the local cache file."""

    def lookup(self, *keys: str) -> Optional[Dict[str, Any]]:
        """
        Walk the local cache along `keys` and return what is stored there.

        Returns None as soon as a key is missing or an intermediate value is
        not a dict. With no keys, the whole cache is returned.
        """
        sub_cache: Any = self.get_local_cache()
        for key in keys:
            # Descend one level per key. Membership must be tested against the
            # current level, not the top-level cache, otherwise nested entries
            # written by set_value() can never be found.
            if not isinstance(sub_cache, dict) or key not in sub_cache:
                return None
            sub_cache = sub_cache[key]

        return sub_cache

    def set_value(self, *keys: str, value: Any) -> None:
        """
        Store `value` under the nested path `keys`, creating intermediate
        dicts as needed, and write the cache back to disk.
        """
        cache = self.get_local_cache()

        sub_cache = cache
        for key in keys[0:-1]:
            sub_cache.setdefault(key, {})
            sub_cache = sub_cache[key]
        sub_cache[keys[-1]] = value

        self.update_local_cache(cache)
class PersistentCache(CacheBase):
    """
    Benchmark-timing cache backed by the local cache file and, when
    use_global_cache() is true, a read-only global cache file.
    """

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps
    # the instance alive for the lifetime of the cache.
    @functools.lru_cache(None)
    def get_global_cache(self):
        """Load the "cache" section of the global cache file ({} if absent)."""
        if self.global_cache_path is None or not self.global_cache_path.is_file():
            return {}
        with open(self.global_cache_path) as global_cache_fp:
            global_cache = json.load(global_cache_fp)
        return global_cache["cache"]

    def lookup(
        self,
        choices: List[ChoiceCaller],
        op: str,
        inputs: str,
        benchmark: Callable[[Any], Dict[ChoiceCaller, float]],
    ) -> Dict[ChoiceCaller, float]:
        """
        Return timings for the given choice callers, benchmarking if needed.

        Caches are indexed as cache[op][inputs][precision][choice.hash_key()],
        where precision is torch.get_float32_matmul_precision().

        With `config.max_autotune` or `config.max_autotune_gemm` enabled:
            1. Check the local cache; a hit requires every choice to be present.
            2. Otherwise, if use_global_cache(), check the global cache.
            3. Otherwise call `benchmark(choices)`, record the timings in the
               local cache, write it to disk and log the values.
        With both disabled, only the global cache is consulted (when
        use_global_cache() is true) and nothing is benchmarked, so the result
        may cover only some of the choices.

        A RuntimeError raised by `benchmark` is logged and re-raised.
        """
        precision = torch.get_float32_matmul_precision()

        log_stats = partial(log_global_cache_stats, self.system, op, inputs, precision)
        log_vals = partial(log_global_cache_vals, self.system, op, inputs, precision)
        log_errors = partial(
            log_global_cache_errors, self.system, op, inputs, precision
        )
        timings = {}

        def check_cache(cache, callback=None) -> bool:
            """Check if `cache` contains data for all the choices"""
            # Fills `timings` as a side effect; stops at the first miss, so
            # `timings` may be left partially populated.
            hit = True
            for choice in choices:
                choice_hash = choice.hash_key()
                if choice_hash in cache.get(op, {}).get(inputs, {}).get(precision, {}):
                    # cache hit
                    timings[choice] = cache[op][inputs][precision][choice_hash]
                else:
                    # cache miss
                    hit = False
                    break
            if callback:
                callback(cached=hit)
            return hit

        if config.max_autotune or config.max_autotune_gemm:
            local_cache = self.get_local_cache()
            # check local cache first since it is data specific to the current machine
            if not check_cache(local_cache) and not (
                use_global_cache()
                and check_cache(self.get_global_cache(), callback=log_stats)
            ):
                try:
                    # re-benchmark everything to try to get consistent numbers from the same machine
                    timings = benchmark(choices)
                    assert all(choice in timings for choice in choices)
                    local_cache.setdefault(op, {})
                    local_cache[op].setdefault(inputs, {}).setdefault(precision, {})
                    for choice, timing in timings.items():
                        local_cache[op][inputs][precision][choice.hash_key()] = timing
                except RuntimeError as e:
                    # catch and log autotuning failures
                    log_errors(e)
                    raise e

                self.update_local_cache(local_cache)

                timings_to_log = {
                    choice.hash_key(): timings[choice] for choice in choices
                }
                log_vals(timings_to_log)
        elif use_global_cache():
            # only check global cache, not local one
            check_cache(self.get_global_cache(), callback=log_stats)
            # may have a partial cache hit, where not everything is benchmarked

        return timings
def get_lock_dir() -> str:
    """Return <cache_dir()>/locks, creating it if necessary."""
    lock_dir = os.path.join(cache_dir(), "locks")
    # exist_ok already covers the "directory is there" case; a separate
    # os.path.exists() check only adds a race between check and create.
    os.makedirs(lock_dir, exist_ok=True)
    return lock_dir


def sha256_hash(data: bytes) -> str:
    """Return the lower-cased base32 sha256 digest of `data`, 51 characters long."""
    # A 32-byte digest encodes to 52 base32 characters plus "====" padding.
    # Keep only the first 51 characters: this drops the padding and the final
    # character, which carries a single bit of the digest.
    return base64.b32encode(hashlib.sha256(data).digest())[:51].decode("utf-8").lower()


def code_hash(code: Union[str, bytes], extra: str = "") -> str:
    """
    Return the cache key for `code`: "c" followed by sha256_hash().

    `str` input is encoded as UTF-8. A non-empty `extra` is mixed into the
    hash, separated from the code by "||".
    """
    hashing_str = code if isinstance(code, bytes) else code.encode("utf-8")
    if extra != "":
        hashing_str = hashing_str + b"||" + extra.encode("utf-8")
    return "c" + sha256_hash(hashing_str)


def get_path(
    basename: str, extension: str, specified_dir: str = ""
) -> Tuple[str, str, str]:
    """
    Compute where `basename`.`extension` lives in the cache.

    An absolute `specified_dir` is used as is; a relative one is resolved
    under cache_dir(). Without `specified_dir`, the subdirectory is
    cache_dir()/basename[1:3].

    Returns (basename, subdir, full path).
    """
    if specified_dir:
        if os.path.isabs(specified_dir):
            subdir = specified_dir
        else:
            subdir = os.path.join(cache_dir(), specified_dir)
    else:
        subdir = os.path.join(cache_dir(), basename[1:3])
    path = os.path.join(subdir, f"{basename}.{extension}")
    return basename, subdir, path


def get_hash(
    content: Union[str, bytes], extra: str = "", hash_type: str = "code"
) -> str:
    """
    Hash `content` according to `hash_type`.

    "code" hashes the content together with `extra`. "cubin" and "hsaco"
    hash repr(content) and ignore `extra`. Any other value raises
    AssertionError.
    """
    if hash_type == "code":
        return code_hash(content, extra)
    if hash_type in ["cubin", "hsaco"]:
        return code_hash(repr(content))
    raise AssertionError(f"Unknown hash type {hash_type}")


def write(
    content: Union[str, bytes],
    extension: str,
    extra: str = "",
    hash_type: str = "code",
    specified_dir: str = "",
) -> Tuple[str, str]:
    """
    Store `content` in the cache under a name derived from its hash.

    The file is written only if it does not already exist.

    Returns (hash key, path of the file).
    """
    # Use stripped content to compute the hash so we don't end up with
    # different hashes just because the content begins/ends with a different
    # number of spaces.
    key: str = get_hash(content.strip(), extra, hash_type)
    basename, subdir, path = get_path(key, extension, specified_dir)
    os.makedirs(subdir, exist_ok=True)
    if not os.path.exists(path):
        write_atomic(path, content)
    return basename, path
def write_atomic(path: str, content: Union[str, bytes]) -> None:
    """
    Write `content` to `path` via a temporary file in the same directory.

    `str` content is written in text mode, `bytes` in binary mode. An existing
    file at `path` is overwritten. Raises AssertionError for any other content
    type.
    """
    # Write into temporary file first to avoid conflicts between threads
    # Avoid using a named temporary file, as those have restricted permissions
    assert isinstance(
        content, (str, bytes)
    ), "Only strings and byte arrays can be saved in the cache"
    target = pathlib.Path(path)
    # The pid/thread id pair keeps concurrent writers from sharing a temp file.
    tmp_path = target.parent / f".{os.getpid()}.{threading.get_ident()}.tmp"
    write_mode = "w" if isinstance(content, str) else "wb"
    try:
        with tmp_path.open(write_mode) as f:
            f.write(content)
        # replace() overwrites an existing destination on every platform;
        # rename() raises on Windows when the destination already exists.
        tmp_path.replace(target)
    except Exception:
        # Best-effort cleanup so a failed write does not leave a stray temp
        # file behind; the original error is what the caller needs to see.
        try:
            tmp_path.unlink()
        except OSError:
            pass
        raise


@dataclasses.dataclass
class TensorMetadataAndValues:
    """
    TensorMetadata plus the elements as a list of raw values.
    Used for hashing inlined constants.
    """

    tensor_metadata: "TensorMetadata"
    values: List[Any]


def _ident(x: Any) -> Any:
    """Identity function used as the reconstructor in the custom reducers."""
    return x


def _reduce_fake_tensor(t):
    """
    See FxGraphCachePickler. Custom reducer to pickle FakeTensors: only the
    extracted tensor metadata is pickled.
    """
    metadata = extract_tensor_metadata(t)
    return (_ident, (metadata,))


def _reduce_tensor(t):
    """
    See FxGraphCachePickler. Custom reducer to pickle Tensors.

    Raises BypassFxGraphCache for mkldnn tensors.
    """
    if t.is_mkldnn:
        # TODO: These tensors don't currently pickle, so we can't cache a
        # compiled graph containing them. Just fail now. If mkldnn tensors
        # get pickling support, we can remove this.
        raise BypassFxGraphCache()

    # If we see tensors, we know they're constants stored as attributes on
    # the GraphModule. See tensor lowering; small constants are inlined. If
    # we see a small tensor, therefore, no reference will ultimately remain
    # in the generated code. So we need to include its value in the cache key.
    # Large constants are effectively treated as inputs and we consider only
    # their metadata.
    metadata = extract_tensor_metadata(t)
    if len(t.shape) == 0 or torch._inductor.graph.GraphLowering.can_inline_constant(t):
        return (_ident, (TensorMetadataAndValues(metadata, t.tolist()),))
    else:
        return (_ident, (metadata,))
def _reduce_symint(s):
    """
    See FxGraphCachePickler. Custom reducer to pickle SymInts.
    """
    # Only the symbol's name takes part in the hash, not its backed value.
    # Guards stored with a cached graph are evaluated on lookup to make sure
    # an entry with SymInt args is safe to reuse.
    symbol_name = str(s)
    return (_ident, (symbol_name,))


class FxGraphCachePickler(pickle.Pickler):
    """
    Pickler used only to derive FxGraphCache keys. Tensors hold state that
    either does not pickle or changes from run to run, so FakeTensor, Tensor
    and SymInt are routed through custom reducers that keep just the data
    needed for a stable, but safe hash.
    """

    dispatch_table = copyreg.dispatch_table.copy()
    dispatch_table.update(
        {
            FakeTensor: _reduce_fake_tensor,
            torch.Tensor: _reduce_tensor,
            torch.SymInt: _reduce_symint,
        }
    )

    @staticmethod
    def dumps(obj) -> bytes:
        """
        Return the bytes produced by pickling `obj` with this pickler.
        """
        with io.BytesIO() as buffer:
            FxGraphCachePickler(buffer).dump(obj)
            return buffer.getvalue()

    @staticmethod
    def get_hash(obj: Any) -> str:
        """
        Pickle `obj` with this pickler and return sha256_hash() of the result.
        """
        return sha256_hash(FxGraphCachePickler.dumps(obj))


@functools.lru_cache(None)
def get_inductor_code_hash() -> bytes:
    """
    Return a sha256 digest over the source of every module found directly in
    this file's directory. The FxGraph cache folds it into its keys so that
    any inductor code change yields new cache keys.
    """
    inductor_root = os.path.dirname(__file__)

    sources: Dict[str, bytes] = {}
    for module_info in pkgutil.iter_modules([inductor_root]):
        spec = module_info.module_finder.find_spec(module_info.name, None)
        assert spec is not None
        origin = spec.origin
        assert origin is not None
        with open(origin, "rb") as source_file:
            sources[origin] = source_file.read()

    digest = hashlib.sha256(pickle.dumps(sources))
    return digest.digest()


@dataclasses.dataclass
class OrderedSetHolder:
    """
    See FxGraphHashDetails. Wraps the sorted elements of a set kwarg so the
    kwarg hashes the same way on every run.
    """

    items: List[Any]


class BypassFxGraphCache(Exception):
    """
    Raised to signal that the FxGraphCache must be skipped for this graph.
    """
class FxGraphHashDetails:
    """
    Bundle of everything about a compiled FX graph that has to feed into a
    safe and stable cache key. Instances are pickled by FxGraphCachePickler,
    so every attribute set in __init__ takes part in the key.
    """

    # Excluded kwargs param that are not stable between runs
    EXCLUDED_KWARGS = ["graph_id"]

    def __init__(
        self,
        gm: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
        fx_kwargs: Dict[str, Any],
    ):
        self.gm = gm
        self.example_inputs = example_inputs

        # Visit kwargs in sorted order so the hash does not depend on the
        # order in which the caller supplied them.
        ordered_kwargs: Dict[str, Any] = {}
        for name in sorted(fx_kwargs):
            if name in self.EXCLUDED_KWARGS:
                continue
            value = fx_kwargs[name]
            if type(value) is set:
                # Sets have no defined order; keep their sorted elements in
                # a proxy object instead.
                value = OrderedSetHolder(sorted(value))
            ordered_kwargs[name] = value
        self.fx_kwargs = ordered_kwargs

        # 'Deterministic algorithms' can affect codegen via lowering to cuda kernels.
        self.deterministic_algorithms_settings = (
            torch.are_deterministic_algorithms_enabled(),
            torch.is_deterministic_algorithms_warn_only_enabled(),
            torch.utils.deterministic.fill_uninitialized_memory,  # type: ignore[attr-defined]
        )

        # Global settings affecting matmul codegen.
        matmul = torch.backends.cuda.matmul
        self.cuda_matmul_settings = (
            matmul.allow_tf32,
            matmul.allow_fp16_reduced_precision_reduction,
            matmul.allow_bf16_reduced_precision_reduction,
        )

        # System information, including the triton version.
        self.torch_version = torch.__version__
        self.system_info = CacheBase.get_system()

        # Inductor's own code and configuration.
        self.inductor_code_hash = get_inductor_code_hash()
        try:
            self.inductor_config = config.save_config()
        except TypeError as e:
            # Some configs options are callables, e.g., post_grad_custom_pre_pass,
            # and may not pickle.
            log.debug("Can't pickle inductor config: %s", e)
            raise BypassFxGraphCache() from e

    def debug_str(self) -> str:
        """
        Render one line per attribute (one per element for list and dict
        attributes), each prefixed with that value's individual hash. Handy
        for finding out why two graphs hash differently.
        """

        def get_str(obj) -> str:
            if isinstance(obj, torch.Tensor):
                return str(extract_tensor_metadata(obj))
            if isinstance(obj, bytes):
                return ""
            return str(obj)

        lines = []
        for attr, obj in vars(self).items():
            if isinstance(obj, list):
                labelled = [(f"{attr}[{ii}]", item) for ii, item in enumerate(obj)]
            elif isinstance(obj, dict):
                labelled = [(f"{attr}[{k}]", v) for k, v in obj.items()]
            else:
                labelled = [(attr, obj)]
            for label, item in labelled:
                h = FxGraphCachePickler.get_hash(item)
                lines.append(f"[{h}] {label}: {get_str(item)}")
        return "\n".join(lines)


def compiled_fx_graph_hash(
    gm: torch.fx.GraphModule,
    example_inputs: List[torch.Tensor],
    fx_kwargs: Dict[str, Any],
) -> str:
    """
    Compute the FxGraphCache key for the given graph, inputs and kwargs.
    """
    details = FxGraphHashDetails(gm, example_inputs, fx_kwargs)
    # The "f" prefix tells FX graph keys apart from the other kinds of
    # objects cached by this module.
    graph_hash = FxGraphCachePickler.get_hash(details)
    key = "f" + graph_hash
    log.debug("FX graph cache hash details for key %s:\n%s", key, details.debug_str())
    return key
class FxGraphCache:
    """
    Supports caching and reusing compiled Fx graphs.

    The overall strategy is as follows:
    - This cache stores entries on disk. When saving an entry, we can't
      serialize callables (that could be C++, Triton, etc.), so we serialize
      their own disk cache location. We then recreate the compiled artifact
      after fetching from disk.
    - For indexing the cache, we gather the fields relevant to identifying an
      FxGraph (the graph module, graph inputs, system settings etc.) into an
      FxGraphHashDetails object, pickle it, and compute a hash for the key.
      See FxGraphCachePickler.
    - Among the metadata we store, we also include a guards expression that's
      appropriate for validating any symbols for Tensor arguments that have
      symbolic bounds. On cache lookup then, we evaluate those guards in the
      current context to validate that a cached entry can be served.
    - A given graph could have multiple compiled versions, corresponding to
      different sets of guards. Therefore, we store cache entries in the form:
          {cache_dir()}/fxgraph/{key[1:3]}/{key}/{hash of the serialized graph}
    - On lookup, we compute the key from the graph details, iterate over all
      leaf files in the corresponding subdirectory, deserialize the entry, and
      evaluate its guards expression. If the evaluation succeeds, we have a
      cache hit. If it fails, we compile the graph and store a new entry.
    - Finally, on a cache hit, we need to make sure any guards that would
      have been created during compilation are added to the current context.
    """

    # TODO(masnesral): Investigate whether it's beneficial to store compiled graphs
    # in an in-memory cache after loading from disk.
    @staticmethod
    def _get_tmp_dir() -> str:
        """
        Get the toplevel temporary directory for storing compiled graphs.
        """
        return os.path.join(cache_dir(), "fxgraph")

    @staticmethod
    def _get_tmp_dir_for_key(key: str) -> str:
        """
        Return the disk location for a given cache key.
        """
        # key[1:3] skips the one-character prefix of the key and uses the next
        # two characters as an intermediate directory.
        return os.path.join(FxGraphCache._get_tmp_dir(), key[1:3], key)

    @staticmethod
    def _filter_symints(inputs: List[Any]) -> List[torch.SymInt]:
        """
        Get the SymInt objects from the input list.
        """
        return [s for s in inputs if isinstance(s, torch.SymInt)]

    @staticmethod
    def _get_shape_env() -> Optional[ShapeEnv]:
        """
        Helper to get the shape env from the tracing context.

        Returns None when there is no active tracing context.
        """
        ctx = torch._guards.TracingContext.try_get()
        if not ctx:
            return None
        return ctx.fake_mode.shape_env

    @staticmethod
    def _lookup_graph(
        key: str,
        example_inputs: List[torch.Tensor],
    ) -> Optional[CompiledFxGraph]:
        """
        Lookup a compiled graph in the cache by key. On a hit, return the
        deserialized CompiledFxGraph object. On a miss, return None.

        On a hit, the entry's guards are added to the current shape env and
        its recorded metrics deltas are applied.
        """
        subdir = FxGraphCache._get_tmp_dir_for_key(key)
        if not os.path.exists(subdir):
            return None

        shape_env = FxGraphCache._get_shape_env()
        assert shape_env is not None

        # Iterate over any entries in the subdir for this key and evaluate
        # their guards to determine whether there's a hit.
        graph = None

        # Sorted so that candidates are always tried in the same order.
        for path in sorted(os.listdir(subdir)):
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file it reads, so the cache directory has to be trusted.
            with open(os.path.join(subdir, path), "rb") as f:
                candidate: CompiledFxGraph = pickle.load(f)

            guards_expr = candidate.guards_expr
            if not guards_expr:
                # No guards to evaluate, so this is a hit.
                graph = candidate
                break

            # Evaluate the guard expression in the current context.
            symints = FxGraphCache._filter_symints(example_inputs)

            # If there's not a cache hit, we don't want the evaluation to
            # affect the current env, e.g., cause the creation of new guards,
            # so we evaluate with the hints instead of the symbols.
            assert all(has_hint(s) for s in symints)
            hints = [hint_int(s) for s in symints]
            hit = bool(shape_env.evaluate_guards_expression(guards_expr, hints))
            log.debug(
                "fx graph cache key %s evaluating guards for %s with values %s => %s",
                key,
                guards_expr,
                hints,
                hit,
            )
            if hit:
                # Now re-evaluate with the symints to add any guards to the current env.
                check = bool(shape_env.evaluate_guards_expression(guards_expr, symints))
                assert check is True
                log.debug(
                    "fx graph cache key %s post-load guards: %s", key, shape_env.guards
                )
                graph = candidate
                break

        # Increment the cached metrics by the amounts recorded when the FX
        # graph was compiled for this cache entry. Pretending these counters
        # were incremented normally is useful for testing with the cache enabled.
        if graph is not None:
            metrics.CachedMetricsHelper.apply_deltas(graph.metrics_deltas)

        return graph

    @staticmethod
    def _save_graph(
        key: str, compiled_graph: CompiledFxGraph, example_inputs: List[torch.Tensor]
    ) -> None:
        """
        Store a serialized CompiledFxGraph on disk.

        If the graph cannot be pickled, the failure is logged and counted and
        nothing is written.
        """
        # Shallow copy, so clearing the artifact below leaves the caller's
        # object untouched.
        disk_compiled_graph = copy(compiled_graph)
        # Important as compiled models are not pickleable:
        disk_compiled_graph.compiled_artifact = None

        # Before serializing, compute the guard expression that will be used to
        # ensure that a CompiledFxGraph is valid when loaded from the cache. It's
        # sufficient to consider only the SymInt args to the fx graph since the
        # Tensor shapes are already captured in the hash for the cache key. Any
        # Tensor arg with a symbolic shape will have a SymInt arg for the graph.
        shape_env = FxGraphCache._get_shape_env()
        assert shape_env is not None
        symints = FxGraphCache._filter_symints(example_inputs)
        disk_compiled_graph.guards_expr = shape_env.produce_guards_expression(symints)

        try:
            content = pickle.dumps(disk_compiled_graph)
        except Exception as e:
            log.debug("fx graph cache unable to serialize compiled graph: %s", e)
            counters["inductor"]["fxgraph_cache_pickle_error"] += 1
            return

        subdir = FxGraphCache._get_tmp_dir_for_key(key)
        if not os.path.exists(subdir):
            os.makedirs(subdir, exist_ok=True)

        # Use a hash of the serialized CompiledFxGraph to get a unique file
        # name. The specific name doesn't matter since a lookup involves
        # iterating over all entries in the parent subdir.
        path = os.path.join(subdir, sha256_hash(content))
        write_atomic(path, content)

    @staticmethod
    def _check_can_cache() -> None:
        """
        Check some conditions that would preclude caching and raise BypassFxGraphCache
        to bypass in case caching is not possible.
        """
        if config.freezing or config.aot_inductor.use_runtime_constant_folding:
            # Freezing can embed constants that wouldn't be static across runs.
            raise BypassFxGraphCache()

        if FxGraphCache._get_shape_env() is None:
            # The treatment of guards in the caching implementation requires that
            # we have a shape env.
            log.debug("fx graph cache no shape env")
            raise BypassFxGraphCache()

    @staticmethod
    def load(
        compile_fx_fn: Callable[..., Any],
        gm: torch.fx.GraphModule,
        example_inputs: List[torch.Tensor],
        fx_kwargs: Dict[str, Any],
    ):
        """
        Load a compiled graph from the cache. If a cached entry does not exist,
        compile the graph and save it to the cache.

        When caching is bypassed (BypassFxGraphCache), the graph is compiled
        with `compile_fx_fn` and nothing is stored.
        """
        from filelock import FileLock

        compiled_graph = None
        try:
            FxGraphCache._check_can_cache()
            key = compiled_fx_graph_hash(gm, example_inputs, fx_kwargs)

            # Lookup, compile and save all happen under a per-key file lock.
            lock_path = os.path.join(get_lock_dir(), key + ".lock")
            with FileLock(lock_path, timeout=LOCK_TIMEOUT):
                compiled_graph = FxGraphCache._lookup_graph(key, example_inputs)
                if compiled_graph is None:
                    log.debug("fx graph cache miss for key %s", key)
                    counters["inductor"]["fxgraph_cache_miss"] += 1
                    compiled_graph = compile_fx_fn(gm, example_inputs, **fx_kwargs)
                    FxGraphCache._save_graph(key, compiled_graph, example_inputs)
                else:
                    log.debug("fx graph cache hit for key %s", key)
                    counters["inductor"]["fxgraph_cache_hit"] += 1
        except BypassFxGraphCache:
            counters["inductor"]["fxgraph_cache_bypass"] += 1

        # Reached with no graph when the cache was bypassed.
        if not compiled_graph:
            compiled_graph = compile_fx_fn(gm, example_inputs, **fx_kwargs)

        return compiled_graph

    @staticmethod
    def clear() -> None:
        """
        Clear out the on-disk cache.
        """
        try:
            shutil.rmtree(FxGraphCache._get_tmp_dir())
        except FileNotFoundError:
            pass
@dataclasses.dataclass
class CompiledFxGraph:
    """
    Container for a compiled FX graph. Instances of this class are what
    FxGraph caching serializes to disk.
    """

    compiled_artifact: Optional[Callable[..., Any]]
    current_callable: Optional[Callable[..., Any]]
    cache_key: Optional[str]
    artifact_path: Optional[str]
    cache_linemap: Optional[List[Tuple[int, str]]]
    device_types: Set[str]
    device_idxs: Set[int]
    mutated_inputs: Set[str]
    mutated_input_idxs: Set[int]
    constants: Dict[str, torch.Tensor]
    output_strides: Optional[List[Optional[Tuple[int, ...]]]]
    disabled_cudagraphs_reason: Optional[str]
    metrics_deltas: metrics.CachedMetricsDeltas
    # String form of the guards expression, serialized with the object so the
    # guards can be re-evaluated in a different context to check that serving
    # this cached fx graph is valid. It must come from
    # ShapeEnv.produce_guards_expression().
    guards_expr: Optional[str]

    _boxed_call: Optional[bool] = None

    def __init__(
        self,
        compiled_artifact: Optional[Callable[..., Any]],
        graph: GraphLowering,
        output_strides: List[Optional[Tuple[int, ...]]],
        disabled_cudagraphs_reason: Optional[str],
        metrics_deltas: metrics.CachedMetricsDeltas,
    ):
        """Capture the compiled artifact and copy the relevant fields of `graph`."""
        self.compiled_artifact = compiled_artifact
        self.current_callable = None

        # Fields taken from the lowered graph.
        self.cache_key = graph.cache_key
        self.artifact_path = graph.cache_path
        self.cache_linemap = graph.cache_linemap
        self.device_types = graph.device_types
        self.device_idxs = graph.device_idxs
        self.mutated_inputs = graph.mutated_inputs
        self.mutated_input_idxs = set(graph.mutated_input_idxs)
        self.constants = graph.constants

        # Fields supplied directly by the caller.
        self.output_strides = output_strides
        self.disabled_cudagraphs_reason = disabled_cudagraphs_reason
        self.metrics_deltas = metrics_deltas
        self.guards_expr = None

    def __call__(self, inputs: List[Any]) -> Any:
        """Run the graph on `inputs` through the current callable."""
        runner = self.get_current_callable()
        return runner(inputs)

    def get_current_callable(self) -> Callable[..., Any]:
        """
        Return `current_callable` when one is set; otherwise a callable that
        runs this graph via _run_from_cache.
        """
        if self.current_callable is not None:
            return self.current_callable
        # Bind a weak proxy rather than `self`: a strong reference here would
        # form a cycle that keeps the CompiledFxGraph from being collected.
        return functools.partial(_run_from_cache, weakref.proxy(self))


def _run_from_cache(compiled_graph: CompiledFxGraph, inputs: List[Any]) -> Any:
    """
    Run `compiled_graph` on `inputs`, first loading its compiled artifact from
    the on-disk code cache if it is not already attached.
    """
    # Callables that may be C++/Triton/etc. can't really be serialized, so
    # their disk cache location is serialized instead.
    # TODO: When making an API that can save compiled models e2e to disk
    # this will need to be better
    if compiled_graph.compiled_artifact is None:
        from .codecache import PyCodeCache

        assert compiled_graph.cache_key
        assert compiled_graph.artifact_path
        loaded_module = PyCodeCache.load_by_key_path(
            compiled_graph.cache_key,
            compiled_graph.artifact_path,
            compiled_graph.cache_linemap,
            compiled_graph.constants,
        )
        compiled_graph.compiled_artifact = loaded_module.call

    return compiled_graph.compiled_artifact(inputs)
def cpp_compiler() -> str:
    """
    Return the C++ compiler to use.

    In fbcode this is build_paths.cc(). Otherwise `config.cpp.cxx` (a single
    entry or a list/tuple of entries) is searched with cpp_compiler_search().
    """
    if config.is_fbcode():
        return build_paths.cc()
    if isinstance(config.cpp.cxx, (list, tuple)):
        search = tuple(config.cpp.cxx)
    else:
        search = (config.cpp.cxx,)
    return cpp_compiler_search(search)


@functools.lru_cache(1)
def cpp_compiler_search(search: Tuple[Optional[str], ...]) -> str:
    """
    Return the first entry of `search` that runs successfully with `--version`.

    A None entry means "install g++ through conda"; it is only honored on
    Linux and only when the TORCH_INDUCTOR_INSTALL_GXX environment variable
    is set. Raises exc.InvalidCxxCompiler when no entry works.

    `search` must be hashable (a tuple) because the result is memoized.
    """
    for cxx in search:
        try:
            if cxx is None:
                # gxx package is only available for Linux
                # according to https://anaconda.org/conda-forge/gxx/
                if sys.platform != "linux":
                    continue
                # Do not install GXX by default
                if not os.getenv("TORCH_INDUCTOR_INSTALL_GXX"):
                    continue
                from filelock import FileLock

                lock_dir = get_lock_dir()
                lock = FileLock(
                    os.path.join(lock_dir, "g++.lock"), timeout=LOCK_TIMEOUT
                )
                with lock:
                    cxx = install_gcc_via_conda()
            subprocess.check_output([cxx, "--version"])
            return cxx
        except (subprocess.SubprocessError, FileNotFoundError, ImportError):
            continue
    raise exc.InvalidCxxCompiler()


def install_gcc_via_conda() -> str:
    """
    On older systems, this is a quick way to get a modern compiler.

    Returns the path <cache_dir()>/gcc/bin/g++. When that file is missing, a
    conda environment containing gxx is created there, provided a conda
    executable can be found. The returned path is not checked for existence.
    """
    prefix = os.path.join(cache_dir(), "gcc")
    cxx_path = os.path.join(prefix, "bin", "g++")
    if not os.path.exists(cxx_path):
        log.info("Downloading GCC via conda")
        # Prefer the executable conda itself advertises, then fall back to a
        # PATH lookup; `conda` stays None when neither is available.
        conda = os.environ.get("CONDA_EXE") or shutil.which("conda")
        if conda is not None:
            subprocess.check_call(
                [
                    conda,
                    "create",
                    f"--prefix={prefix}",
                    "--channel=conda-forge",
                    "--quiet",
                    "-y",
                    "python=3.8",
                    "gxx",
                ],
                # The output is not needed. DEVNULL rather than PIPE: nothing
                # reads the pipe, so the child could block once it fills up.
                stdout=subprocess.DEVNULL,
            )
    return cxx_path


def is_gcc() -> bool:
    """Return True when the compiler path contains "gcc" or "g++"."""
    # NOTE(review): this pattern also matches the "g++" inside "clang++", so
    # clang++ is reported as gcc as well. Left unchanged because callers may
    # rely on it -- confirm before tightening.
    return bool(re.search(r"(gcc|g\+\+)", cpp_compiler()))


def is_clang() -> bool:
    """Return True when the compiler path contains "clang"."""
    return bool(re.search(r"(clang|clang\+\+)", cpp_compiler()))


@functools.lru_cache(None)
def is_apple_clang() -> bool:
    """Return True when the first line of `<compiler> --version` mentions "Apple"."""
    cxx = cpp_compiler()
    version_string = subprocess.check_output([cxx, "--version"]).decode("utf8")
    return "Apple" in version_string.splitlines()[0]
+ _avx_code = """ +#if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) +#include +#include +#endif + +__attribute__((aligned(64))) float in_out_ptr0[16] = {0.0}; + +extern "C" void __avx_chk_kernel() { + auto tmp0 = at::vec::Vectorized(1); + auto tmp1 = tmp0.exp(); + tmp1.store(in_out_ptr0); +} +""" # noqa: B950 + + _avx_py_load = """ +import torch +from ctypes import cdll +cdll.LoadLibrary("__lib_path__") +""" + + def bit_width(self) -> int: + return self._bit_width + + def nelements(self, dtype: torch.dtype = torch.float) -> int: + return self._dtype_nelements[dtype] + + def build_macro(self) -> str: + return self._macro + + def build_arch_flags(self) -> str: + return self._arch_flags + + def __hash__(self) -> int: + return hash(str(self)) + + @functools.lru_cache(None) + def __bool__(self) -> bool: + if config.cpp.vec_isa_ok is not None: + return config.cpp.vec_isa_ok + + if config.is_fbcode(): + return True + + key, input_path = write(VecISA._avx_code, "cpp") + from filelock import FileLock + + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + output_path = input_path[:-3] + "so" + build_cmd = shlex.split( + cpp_compile_command( + input_path, output_path, warning_all=False, vec_isa=self + ) + ) + try: + # Check build result + compile_file(input_path, output_path, build_cmd) + subprocess.check_call( + [ + sys.executable, + "-c", + VecISA._avx_py_load.replace("__lib_path__", output_path), + ], + stderr=subprocess.DEVNULL, + env={**os.environ, "PYTHONPATH": ":".join(sys.path)}, + ) + except Exception as e: + return False + + return True + + +@dataclasses.dataclass +class VecAVX512(VecISA): + _bit_width = 512 + _macro = "-DCPU_CAPABILITY_AVX512" + _arch_flags = "-mavx512f -mavx512dq -mavx512vl -mavx512bw -mfma" + _dtype_nelements = {torch.float: 16, torch.bfloat16: 32, torch.float16: 32} + + def __str__(self) -> str: + return "avx512" + + 
__hash__: Callable[[VecISA], Any] = VecISA.__hash__ + + +@dataclasses.dataclass +class VecAVX2(VecISA): + _bit_width = 256 + _macro = "-DCPU_CAPABILITY_AVX2" + _arch_flags = "-mavx2 -mfma" + _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16} + + def __str__(self) -> str: + return "avx2" + + __hash__: Callable[[VecISA], Any] = VecISA.__hash__ + + +@dataclasses.dataclass +class VecZVECTOR(VecISA): + _bit_width = 256 + _macro = "-DCPU_CAPABILITY_ZVECTOR -DCPU_CAPABILITY=ZVECTOR -DHAVE_ZVECTOR_CPU_DEFINITION" + _arch_flags = "-mvx -mzvector" + _dtype_nelements = {torch.float: 8, torch.bfloat16: 16, torch.float16: 16} + + def __str__(self) -> str: + return "zvector" + + __hash__: Callable[[VecISA], Any] = VecISA.__hash__ + + +class InvalidVecISA(VecISA): + _bit_width = 0 + _macro = "" + _arch_flags = "" + _dtype_nelements = {} + + def __str__(self) -> str: + return "INVALID_VEC_ISA" + + def __bool__(self) -> bool: # type: ignore[override] + return False + + __hash__: Callable[[VecISA], Any] = VecISA.__hash__ + + +invalid_vec_isa = InvalidVecISA() +supported_vec_isa_list = [VecAVX512(), VecAVX2()] + + +# Cache the cpuinfo to avoid I/O overhead. Meanwhile, the cpuinfo content +# might have too much redundant content that is useless for ISA check. Hence, +# we only cache some key isa information. 
+@functools.lru_cache(None) +def valid_vec_isa_list() -> List[VecISA]: + if sys.platform != "linux": + return [] + + if platform.machine() == "s390x": + return [VecZVECTOR()] + + isa_list = [] + with open("/proc/cpuinfo") as _cpu_info: + _cpu_info_content = _cpu_info.read() + for isa in supported_vec_isa_list: + if str(isa) in _cpu_info_content and isa: + isa_list.append(isa) + return isa_list + + +def pick_vec_isa() -> VecISA: + if config.is_fbcode(): + return VecAVX2() + + _valid_vec_isa_list: List[VecISA] = valid_vec_isa_list() + if not _valid_vec_isa_list: + return invalid_vec_isa + + # If the simdlen is None, it indicates determin the vectorization length automatically + if config.cpp.simdlen is None: + assert _valid_vec_isa_list + return _valid_vec_isa_list[0] + + for isa in _valid_vec_isa_list: + if config.cpp.simdlen == isa.bit_width(): + return isa + + return invalid_vec_isa + + +def get_compile_only(compile_only: bool = True) -> str: + return "-c" if compile_only else "" + + +def get_shared(shared: bool = True, compile_only: bool = False) -> str: + if not shared: + return "" + if compile_only: + return "-fPIC" + if platform.system() == "Darwin" and "clang" in cpp_compiler(): + # This causes undefined symbols to behave the same as linux + return "-shared -fPIC -undefined dynamic_lookup" + else: + return "-shared -fPIC" + + +def get_warning_all_flag(warning_all: bool = True) -> str: + return "-Wall" if warning_all else "" + + +def get_glibcxx_abi_build_flags() -> str: + return "-D_GLIBCXX_USE_CXX11_ABI=" + str(int(torch._C._GLIBCXX_USE_CXX11_ABI)) + + +def cpp_flags() -> str: + flags = ["-std=c++17", "-Wno-unused-variable", "-Wno-unknown-pragmas"] + if is_clang(): + flags.append("-Werror=ignored-optimization-argument") + return " ".join(flags) + + +def cpp_wrapper_flags() -> str: + return "-DTORCH_INDUCTOR_CPP_WRAPPER" + + +def optimization_flags() -> str: + base_flags = "-O0 -g" if config.aot_inductor.debug_compile else "-O3 -DNDEBUG" + base_flags += " 
-ffast-math -fno-finite-math-only" + if not config.cpp.enable_unsafe_math_opt_flag: + base_flags += " -fno-unsafe-math-optimizations" + if not config.cpp.enable_floating_point_contract_flag: + base_flags += " -ffp-contract=off" + + if config.is_fbcode(): + # FIXME: passing `-fopenmp` adds libgomp.so to the generated shared library's dependencies. + # This causes `ldopen` to fail in fbcode, because libgomp does not exist in the default paths. + # We will fix it later by exposing the lib path. + return base_flags + + if sys.platform == "darwin": + # Per https://mac.r-project.org/openmp/ right way to pass `openmp` flags to MacOS is via `-Xclang` + # Also, `-march=native` is unrecognized option on M1 + base_flags += " -Xclang" + else: + if platform.machine() == "ppc64le": + base_flags += " -mcpu=native" + else: + base_flags += " -march=native" + + # Internal cannot find libgomp.so + if not config.is_fbcode(): + base_flags += " -fopenmp" + return base_flags + + +def use_custom_generated_macros() -> str: + return "-D C10_USING_CUSTOM_GENERATED_MACROS" + + +def use_fb_internal_macros() -> str: + if config.is_fbcode(): + openmp_lib = build_paths.openmp_lib() + preprocessor_flags = " ".join( + ( + "-D C10_USE_GLOG", + "-D C10_USE_MINIMAL_GLOG", + "-D C10_DISABLE_TENSORIMPL_EXTENSIBILITY", + ) + ) + return f"-Wp,-fopenmp {openmp_lib} {preprocessor_flags}" + else: + return "" + + +def use_standard_sys_dir_headers() -> str: + if config.is_fbcode(): + return "-nostdinc" + else: + return "" + + +@functools.lru_cache(None) +def is_conda_llvm_openmp_installed() -> bool: + try: + command = "conda list llvm-openmp --json" + output = subprocess.check_output(command.split()).decode("utf8") + return len(json.loads(output)) > 0 + except subprocess.SubprocessError: + return False + + +@functools.lru_cache(None) +def homebrew_libomp() -> Tuple[bool, str]: + try: + # check if `brew` is installed + subprocess.check_output(["which", "brew"]) + # get the location of `libomp` if it is 
installed + # this is the location that `libomp` **would** be installed + # see https://github.com/Homebrew/brew/issues/10261#issuecomment-756563567 for details + libomp_path = ( + subprocess.check_output(["brew", "--prefix", "libomp"]) + .decode("utf8") + .strip() + ) + # check if `libomp` is installed + omp_available = os.path.exists(libomp_path) + return omp_available, libomp_path + except subprocess.SubprocessError: + return False, "" + + +def get_include_and_linking_paths( + include_pytorch: bool = False, + vec_isa: VecISA = invalid_vec_isa, + cuda: bool = False, + aot_mode: bool = False, +) -> Tuple[List[str], str, str, str, str]: + if ( + config.is_fbcode() + and "CUDA_HOME" not in os.environ + and "CUDA_PATH" not in os.environ + ): + os.environ["CUDA_HOME"] = os.path.dirname(build_paths.cuda()) + from torch.utils import cpp_extension + + macros = "" + build_arch_flags = "" + if sys.platform == "linux" and ( + include_pytorch + or vec_isa != invalid_vec_isa + or cuda + or config.cpp.enable_kernel_profile + ): + # Note - We include pytorch only on linux right now. There is more work + # to do to enable OMP build on darwin where PyTorch is built with IOMP + # and we need a way to link to what PyTorch links. + ipaths = cpp_extension.include_paths(cuda) + [sysconfig.get_path("include")] + lpaths = cpp_extension.library_paths(cuda) + [ + sysconfig.get_config_var("LIBDIR") + ] + + libs = [] + + # No need to manually specify libraries in fbcode. 
+ if not config.is_fbcode(): + libs += ["torch", "torch_cpu"] + libs += ["gomp"] + if not aot_mode: + libs += ["torch_python"] + else: + # internal remote execution is able to find omp, but not gomp + libs += ["omp"] + if aot_mode: + ipaths += [os.path.dirname(cpp_prefix_path())] + if cuda: + # This is a special treatment for Meta internal cuda-12 where all libs + # are in lib/cuda-12 and lib/cuda-12/stubs + for i, path in enumerate(lpaths): + if path.startswith( + os.environ["CUDA_HOME"] + ) and not os.path.exists(f"{path}/libcudart_static.a"): + for root, dirs, files in os.walk(path): + if "libcudart_static.a" in files: + lpaths[i] = os.path.join(path, root) + lpaths.append(os.path.join(lpaths[i], "stubs")) + break + macros = vec_isa.build_macro() + if macros: + if config.is_fbcode() and vec_isa != invalid_vec_isa: + cap = str(vec_isa).upper() + macros = " ".join( + [ + vec_isa.build_arch_flags(), + f"-D CPU_CAPABILITY={cap}", + f"-D CPU_CAPABILITY_{cap}", + f"-D HAVE_{cap}_CPU_DEFINITION", + ] + ) + + if cuda: + if macros is None: + macros = "" + macros += " -D USE_ROCM" if torch.version.hip else " -D USE_CUDA" + + if cuda: + if torch.version.hip is not None: + libs += ["c10_hip", "torch_hip"] + macros += " -D __HIP_PLATFORM_AMD__" + else: + if config.is_fbcode(): + libs += ["cuda"] + else: + libs += ["c10_cuda", "cuda", "torch_cuda"] + build_arch_flags = vec_isa.build_arch_flags() + else: + # Note - this is effectively a header only inclusion. Usage of some header files may result in + # symbol not found, if those header files require a library. + # For those cases, include the lpath and libs command as we do for pytorch above. + # This approach allows us to only pay for what we use. 
+ ipaths = cpp_extension.include_paths(cuda) + [sysconfig.get_path("include")] + if aot_mode: + ipaths += [os.path.dirname(cpp_prefix_path())] + lpaths = [] + if sys.platform == "darwin": + # only Apple builtin compilers (Apple Clang++) require openmp + omp_available = not is_apple_clang() + + # check the `OMP_PREFIX` environment first + if os.getenv("OMP_PREFIX") is not None: + header_path = os.path.join(os.getenv("OMP_PREFIX"), "include", "omp.h") # type: ignore[arg-type] + valid_env = os.path.exists(header_path) + if valid_env: + ipaths.append(os.path.join(os.getenv("OMP_PREFIX"), "include")) # type: ignore[arg-type] + lpaths.append(os.path.join(os.getenv("OMP_PREFIX"), "lib")) # type: ignore[arg-type] + else: + warnings.warn("environment variable `OMP_PREFIX` is invalid.") + omp_available = omp_available or valid_env + + libs = [] if omp_available else ["omp"] + + # prefer to use openmp from `conda install llvm-openmp` + if not omp_available and os.getenv("CONDA_PREFIX") is not None: + omp_available = is_conda_llvm_openmp_installed() + if omp_available: + conda_lib_path = os.path.join(os.getenv("CONDA_PREFIX"), "lib") # type: ignore[arg-type] + ipaths.append(os.path.join(os.getenv("CONDA_PREFIX"), "include")) # type: ignore[arg-type] + lpaths.append(conda_lib_path) + # Prefer Intel OpenMP on x86 machine + if os.uname().machine == "x86_64" and os.path.exists( + os.path.join(conda_lib_path, "libiomp5.dylib") + ): + libs = ["iomp5"] + + # next, try to use openmp from `brew install libomp` + if not omp_available: + omp_available, libomp_path = homebrew_libomp() + if omp_available: + ipaths.append(os.path.join(libomp_path, "include")) + lpaths.append(os.path.join(libomp_path, "lib")) + + # if openmp is still not available, we let the compiler to have a try, + # and raise error together with instructions at compilation error later + else: + libs = ["omp"] if config.is_fbcode() else ["gomp"] + + # Unconditionally import c10 for non-abi-compatible mode to use 
TORCH_CHECK - See PyTorch #108690 + if not config.abi_compatible: + libs += ["c10"] + lpaths += [cpp_extension.TORCH_LIB_PATH] + + # third party libs + if config.is_fbcode(): + ipaths.append(build_paths.sleef()) + ipaths.append(build_paths.openmp()) + ipaths.append(build_paths.cc_include()) + ipaths.append(build_paths.libgcc()) + ipaths.append(build_paths.libgcc_arch()) + ipaths.append(build_paths.libgcc_backward()) + ipaths.append(build_paths.glibc()) + ipaths.append(build_paths.linux_kernel()) + ipaths.append(build_paths.cuda()) + # We also need to bundle includes with absolute paths into a remote directory + # (later on, we copy the include paths from cpp_extensions into our remote dir) + ipaths.append("include") + + static_link_libs = [] + if aot_mode and cuda and config.is_fbcode(): + # For Meta internal cuda-12, it is recommended to static link cudart + static_link_libs = ["-Wl,-Bstatic", "-lcudart_static", "-Wl,-Bdynamic"] + + lpaths_str = " ".join(["-L" + p for p in lpaths]) + libs_str = " ".join(static_link_libs + ["-l" + p for p in libs]) + return ipaths, lpaths_str, libs_str, macros, build_arch_flags + + +def cpp_compile_command( + input: Union[str, List[str]], + output: str, + warning_all: bool = True, + shared: bool = True, + include_pytorch: bool = False, + vec_isa: VecISA = invalid_vec_isa, + cuda: bool = False, + aot_mode: bool = False, + compile_only: bool = False, + use_absolute_path: bool = False, +) -> str: + ipaths, lpaths, libs, macros, build_arch_flags = get_include_and_linking_paths( + include_pytorch, vec_isa, cuda, aot_mode + ) + if isinstance(input, str): + input = [input] + ipaths_str = " ".join(["-I" + p for p in ipaths]) + clang_flags = "" + if config.is_fbcode(): + if aot_mode and not use_absolute_path: + inp_name = input + out_name = output + linker_script = _LINKER_SCRIPT + else: + # We need to copy any absolute-path torch includes + inp_name = [os.path.basename(i) for i in input] + out_name = os.path.basename(output) + 
linker_script = os.path.basename(_LINKER_SCRIPT) + assert is_clang() + # Use clang runtime instead of libgcc + clang_flags += " --rtlib=compiler-rt" + clang_flags += " -fuse-ld=lld" + clang_flags += f" -Wl,--script={linker_script}" + linker_paths = "-B" + build_paths.glibc_lib() + linker_paths += " -L" + build_paths.glibc_lib() + else: + inp_name = input + out_name = output + linker_paths = "" # let the compiler pick + if compile_only: + libs, lpaths = "", "" + inp_name_str = " ".join(inp_name) + return re.sub( + r"[ \n]+", + " ", + f""" + {cpp_compiler()} {inp_name_str} {get_shared(shared, compile_only)} + {get_warning_all_flag(warning_all)} {cpp_flags()} + {get_glibcxx_abi_build_flags()} + {ipaths_str} {lpaths} {libs} {build_arch_flags} + {macros} {linker_paths} {clang_flags} + {optimization_flags()} + {use_custom_generated_macros()} + {use_fb_internal_macros()} + {use_standard_sys_dir_headers()} + {get_compile_only(compile_only)} + -o {out_name} + """, + ).strip() + + +def run_command_and_check(cmd: str): + cmd = shlex.split(cmd) + try: + subprocess.check_call(cmd) + except subprocess.CalledProcessError as e: + raise exc.CppCompileError(cmd, e.output) from e + + +@functools.lru_cache(None) +def split_aot_inductor_output_path(path: str) -> Tuple[str, str]: + """Returns the path where the AOT Inductor compiled kernels are stored.""" + if path.endswith(".so"): + return os.path.split(path) + else: + return path, "" + + +class CudaKernelParamCache: + cache: Dict[str, Dict[str, str]] = dict() + clear = staticmethod(cache.clear) + + @classmethod + def set(cls, key: str, params: Dict[str, str], cubin: str) -> None: + bin_type = "cubin" if torch.version.hip is None else "hsaco" + _, path = write( + cubin, + bin_type, + hash_type=bin_type, + specified_dir=split_aot_inductor_output_path( + config.aot_inductor.output_path + )[0], + ) + + params[get_cpp_wrapper_cubin_path_name()] = path + + cls.cache[key] = params + + @classmethod + def get(cls, key: str) -> 
Optional[Dict[str, str]]: + return cls.cache.get(key, None) + + @classmethod + def get_keys(cls): + return cls.cache.keys() + + +class AotCodeCompiler: + @classmethod + def compile( + cls, + graph: GraphLowering, + source_code: str, + serialized_extern_kernel_nodes: Optional[str], + cuda: bool, + ) -> str: + picked_vec_isa = pick_vec_isa() + cpp_command = repr( + cpp_compile_command( + "i", "o", vec_isa=picked_vec_isa, cuda=cuda, aot_mode=graph.aot_mode + ) + ) + fbcode_aot_cpu_re = False + use_absolute_path = False + if config.is_fbcode(): + ld_command = build_paths.ld() + if not cuda and graph.aot_mode: # Meta internal AOTInductor CPU + objcopy_command = build_paths.objcopy_fallback() + fbcode_aot_cpu_re = True + use_absolute_path = True + else: + objcopy_command = build_paths.objcopy() + else: + ld_command = "ld" + objcopy_command = "objcopy" + + ( + specified_output_path, + specified_so_name, + ) = split_aot_inductor_output_path(config.aot_inductor.output_path) + key, input_path = write( + source_code, + "cpp", + extra=cpp_command, + specified_dir=specified_output_path, + ) + + def _compile_consts_linux(consts: bytes) -> str: + _, consts_path = write( + consts, + "bin", + specified_dir=specified_output_path, + ) + + consts_o = os.path.splitext(consts_path)[0] + ".o" + if fbcode_aot_cpu_re: + cmd = f"{ld_command} -r -b binary -o {os.path.basename(consts_o)} {os.path.basename(consts_path)}" + compile_file(consts_path, consts_o, cmd.split()) + os.chmod(consts_o, 0o644) + else: + cmd = f"{ld_command} -r -b binary -o {consts_o} {consts_path}" + run_command_and_check(cmd) + log.debug("aot constant binary command: %s", cmd) + + cmd = ( + f"{objcopy_command} --rename-section" + " .data=.lrodata,alloc,load,readonly,data,contents" + f" {consts_o} {consts_o}" + ) + log.debug("aot constant obj command: %s", cmd) + run_command_and_check(cmd) + + cmd = f"rm {consts_path}" + log.debug("aot constant bin removal command: %s", cmd) + run_command_and_check(cmd) + + if 
fbcode_aot_cpu_re: + body = re.sub(r"[\W]", "_", os.path.basename(consts_path)) + else: + body = re.sub(r"[\W]", "_", consts_path) + + symbol_list = [] + symbol_list.append( + f"{objcopy_command} --redefine-sym _binary_{body}_start=_binary_constants_bin_start {consts_o}" + ) + symbol_list.append( + f"{objcopy_command} --redefine-sym _binary_{body}_size=_binary_constants_bin_size {consts_o}" + ) + symbol_list.append( + f"{objcopy_command} --redefine-sym _binary_{body}_end=_binary_constants_bin_end {consts_o}" + ) + log.debug("aot constant binary redefine symbol: %s", " ".join(symbol_list)) + for cmd in symbol_list: + run_command_and_check(cmd) + return consts_o + + def _compile_consts_darwin(consts: bytes) -> str: + is_large_consts = len(consts) > 1024 + consts_asm = "\t.section\t__TEXT,__const\n" + consts_asm += "\t.globl\t__binary_constants_bin_start\n" + consts_asm += "__binary_constants_bin_start:\n" + if not is_large_consts: + for c in consts: + consts_asm += f"\t.byte {c}\n" + # Add one element even if constants are empty + # Otherwise assembler will not put them in data section + if not consts: + consts_asm += "\t.space 1\n" + else: + consts_asm += "\t.quad 0x1234567899abcdef\n" + consts_asm += f"\t.space {len(consts) - 8}\n" + consts_asm += ".globl\t__binary_constants_bin_end\n" + consts_asm += "__binary_constants_bin_end:\n" + _, consts_path = write( + consts_asm, + "S", + specified_dir=specified_output_path, + ) + consts_o = os.path.splitext(consts_path)[0] + ".o" + cmd = f"{cpp_compiler()} -c -o {consts_o} {consts_path}" + run_command_and_check(cmd) + if is_large_consts: + with open(consts_o, "r+b") as f: + f.seek(0) + hdr = f.read(1024) + # Search for magic number and write the actual data over it + start_idx = hdr.find(b"\xef\xcd\xab\x99\x78\x56\x34\x12") + assert start_idx != -1 + f.seek(start_idx) + pos = 0 + while pos < len(consts): + rc = f.write(consts[pos:]) + pos += rc + return consts_o + + from filelock import FileLock + + lock_dir = 
get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + # Currently, this only support serializing extern nodes in fbcode + # Eventually, we should also have a serializer for OSS. + if config.is_fbcode() and serialized_extern_kernel_nodes: + output_json = os.path.splitext(input_path)[0] + ".json" + with open(output_json, "w") as f: + f.write(serialized_extern_kernel_nodes) + + output_so = ( + config.aot_inductor.output_path + if specified_so_name + else os.path.splitext(input_path)[0] + ".so" + ) + + output_o = os.path.splitext(input_path)[0] + ".o" + cmd = cpp_compile_command( + input=input_path, + output=output_o, + vec_isa=picked_vec_isa, + cuda=cuda, + aot_mode=graph.aot_mode, + compile_only=True, + use_absolute_path=use_absolute_path, + ) + log.debug("aot compilation command: %s", cmd) + if fbcode_aot_cpu_re: + compile_file(input_path, output_o, cmd.split()) + os.chmod(output_o, 0o644) + else: + run_command_and_check(cmd) + + def _to_bytes(t: torch.Tensor) -> bytes: + # This serializes the tensor's untyped_storage to bytes by accessing + # the raw data of the underlying structure. 
+ import ctypes + + if t.numel() == 0: + return b"" + + t_cpu = t.untyped_storage().cpu() + raw_array = ctypes.cast( + t_cpu.data_ptr(), + ctypes.POINTER(ctypes.c_ubyte * t_cpu.nbytes()), + ) + + return bytes(raw_array.contents) + + aot_constants = b"".join( + _to_bytes(tensor) + for name, tensor in graph.constants.items() + if name not in graph.folded_constants + ) + consts_o = { + "linux": _compile_consts_linux, + "darwin": _compile_consts_darwin, + }[sys.platform](aot_constants) + + cmd = cpp_compile_command( + input=[output_o, consts_o], + output=output_so, + vec_isa=picked_vec_isa, + cuda=cuda, + aot_mode=graph.aot_mode, + use_absolute_path=use_absolute_path, + ) + log.debug("aot linkage command: %s", cmd) + if fbcode_aot_cpu_re: + compile_file([output_o, consts_o], output_so, cmd.split()) + os.chmod(output_so, 0o755) + else: + run_command_and_check(cmd) + + return output_so + + +# Putting this fn in cpp.py (unfortunately) causes a deadlock, which is why it's in codecache.py. +# Why? importing from cpp.py invokes codecache.pick_vec_isa(), which takes out a lock. +# Cycle goes: +# - CppCodeCache.load() +# - pick_vec_isa() +# - valid_vec_isa_list() +# - VecISA.__bool__() <-- takes out a lock +# - compile_file() <-- imports cpp_prefix_path from cpp, which causes us to try to take out the same lock. +@functools.lru_cache +def cpp_prefix_path() -> str: + path = Path(__file__).parent / "codegen/cpp_prefix.h" + with path.open() as f: + content = f.read() + _, filename = write( + content, + "h", + ) + return filename + + +def cpp_prefix() -> str: + filename = cpp_prefix_path() + if config.is_fbcode(): + # We need relative paths, since we bundle up + # everything that we compile into a folder for remote compilation. 
+ return f'#include "{os.path.basename(filename)}"' + else: + return f'#include "{filename}"' + + +# Given a path to an input cpp file and an output path, +# Attempts to compile the file, storing the output in "output_path" +@dynamo_timed +def compile_file( + input_path: Union[str, List[str]], output_path: str, cmd: List[str] +) -> None: + input_paths = [input_path] if isinstance(input_path, str) else input_path + input_files = [ + os.path.basename(ip) if config.is_fbcode() else ip for ip in input_paths + ] + try: + if config.is_fbcode(): + # Need to copy our header into the same folder as the sourcecode. + header_path = cpp_prefix_path() + header_name = os.path.basename(header_path) + output_name = os.path.basename(output_path) + # When we build remotely, we need to make sure to carefully copy any files + # that are required during the compilation process into our build directly. + # This is where all of the ATen/c10/Torch includes come from. + torch_includes_path = os.path.join(_TORCH_PATH, "include") + with tempfile.TemporaryDirectory() as tmp_dir: + # Copy everything to tmp compilation folder + shutil.copy(header_path, os.path.join(tmp_dir, header_name)) + shutil.copy(_LINKER_SCRIPT, os.path.join(tmp_dir, "script.ld")) + for p, f in zip(input_paths, input_files): + shutil.copy(p, os.path.join(tmp_dir, f)) + dest_include_path = os.path.join(tmp_dir, "include") + shutil.copytree(torch_includes_path, dest_include_path) + # Run the build + output_file_path = _run_build_command(cmd, tmp_dir, output_name) + # Copy output from the build + if os.path.exists(output_path): + os.remove(output_path) + shutil.copy(output_file_path, output_path) + else: + subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + output = e.output.decode("utf-8") + openmp_problem = "'omp.h' file not found" in output or "libomp" in output + if openmp_problem and sys.platform == "darwin": + instruction = ( + "\n\nOpenMP support not found. 
Please try one of the following solutions:\n" + "(1) Set the `CXX` environment variable to a compiler other than Apple clang++/g++ " + "that has builtin OpenMP support;\n" + "(2) install OpenMP via conda: `conda install llvm-openmp`;\n" + "(3) install libomp via brew: `brew install libomp`;\n" + "(4) manually setup OpenMP and set the `OMP_PREFIX` environment variable to point to a path" + " with `include/omp.h` under it." + ) + output += instruction + raise exc.CppCompileError(cmd, output) from e + + +_libgomp: Optional[CDLL] = None + + +class CppCodeCache: + cache: Dict[str, Union[CDLL, ModuleType]] = {} + clear = staticmethod(cache.clear) + cpp_compile_command_flags: Dict[str, Any] = {} + + @staticmethod + def _load_library_inner(path: str, key: str) -> Union[CDLL, ModuleType]: + return cdll.LoadLibrary(path) + + @classmethod + def _load_library(cls, path: str, key: str) -> Union[CDLL, ModuleType]: + try: + return cls._load_library_inner(path, key) + except (ImportError, OSError) as e: + if "gomp" in str(e) and os.path.exists("/usr/lib64/libgomp.so.1"): + # hacky workaround for fbcode/buck + global _libgomp + _libgomp = cdll.LoadLibrary("/usr/lib64/libgomp.so.1") + return cls._load_library_inner(path, key) + if "failed to map segment from shared object" in str(e): + raise OSError( + f"{e}. The most common reason this may occur is if the {tempfile.gettempdir()} folder " + "is mounted with noexec (e.g., by default Docker mounts tmp file systems " + f"as noexec). Please remount {tempfile.gettempdir()} with exec enabled, or set another " + "temporary directory with TORCHINDUCTOR_CACHE_DIR environment variable." 
+ ) from e + raise + + @classmethod + def load(cls, source_code: str, cuda: bool = False) -> Union[CDLL, ModuleType]: + cls.cpp_compile_command_flags.update({"cuda": cuda}) + picked_vec_isa = pick_vec_isa() + cpp_command = repr( + cpp_compile_command( + "i", "o", vec_isa=picked_vec_isa, **cls.cpp_compile_command_flags + ) + ) + key, input_path = write(source_code, "cpp", extra=cpp_command) + if key not in cls.cache: + from filelock import FileLock + + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + output_path = input_path[:-3] + "so" + if not os.path.exists(output_path): + cmd = shlex.split( + cpp_compile_command( + input=input_path, + output=output_path, + vec_isa=picked_vec_isa, + **cls.cpp_compile_command_flags, + ) + ) + compile_file(input_path, output_path, cmd) + cls.cache[key] = cls._load_library(output_path, key) + cls.cache[key].key = key # type: ignore[union-attr] + + return cls.cache[key] + + +# Customized Python binding for cpp kernels +class CppPythonBindingsCodeCache(CppCodeCache): + cache: Dict[str, Union[CDLL, ModuleType]] = {} + clear = staticmethod(cache.clear) + cpp_compile_command_flags = { + # kernels have no dependency on libtorch + "include_pytorch": False, + "shared": True, + } + entry_function = "kernel" + call_entry_function = "kernel(%s);Py_RETURN_NONE;" + extra_parse_arg = "" + suffix_template = textwrap.dedent( + """ + // Python bindings to call %s(): + #define PY_SSIZE_T_CLEAN + #include + #include + #include + + // This is defined in guards.cpp so we don't need to import PyTorch headers that are slooow. + // We manually link it below to workaround issues with fbcode build. 
+ static void* (*_torchinductor_pyobject_tensor_data_ptr)(PyObject* obj); + + template static inline T parse_arg(PyObject* args, size_t n) { + static_assert(std::is_pointer::value, "arg type must be pointer or long"); + return static_cast(_torchinductor_pyobject_tensor_data_ptr(PyTuple_GET_ITEM(args, n))); + } + template <> inline long parse_arg(PyObject* args, size_t n) { + auto result = PyLong_AsSsize_t(PyTuple_GET_ITEM(args, n)); + if(result == -1 && PyErr_Occurred()) + [[unlikely]] throw std::runtime_error("expected int arg"); + return result; + } + + %s + + static PyObject* %s_py(PyObject* self, PyObject* args) { + try { + if(!PyTuple_CheckExact(args)) + [[unlikely]] throw std::runtime_error("tuple args required"); + if(PyTuple_GET_SIZE(args) != %s) + [[unlikely]] throw std::runtime_error("requires %s args"); + %s + } catch(std::exception const& e) { + PyErr_SetString(PyExc_RuntimeError, e.what()); + return nullptr; + } catch(...) { + PyErr_SetString(PyExc_RuntimeError, "unhandled error"); + return nullptr; + } + } + + static PyMethodDef py_methods[] = { + {"%s", %s_py, METH_VARARGS, ""}, + {NULL, NULL, 0, NULL}}; + + static struct PyModuleDef py_module = + {PyModuleDef_HEAD_INIT, "%s", NULL, -1, py_methods}; + + PyMODINIT_FUNC PyInit_%s(void) { + const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR"); + if(!str_addr) { + PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set"); + return nullptr; + } + std::istringstream iss(str_addr); + uintptr_t addr = 0; + iss >> addr; + _torchinductor_pyobject_tensor_data_ptr = + reinterpret_cast(addr); + return PyModule_Create(&py_module); + } + """ + ) + + @classmethod + def _load_library_inner(cls, path: str, key: str) -> ModuleType: + os.environ["_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR"] = str( + torch._C._dynamo.guards._torchinductor_pyobject_tensor_data_ptr # type: ignore[attr-defined] + ) + return importlib.machinery.ExtensionFileLoader( + 
f"{key}.{cls.entry_function}", path + ).load_module() # type: ignore[call-arg] + + @classmethod + def load_pybinding( + cls, + argtypes: List[str], + source_code: str, + cuda: bool = False, + num_outputs: int = -1, + ) -> Any: + """ + Wrap a C++ function in fast Python bindings. + + Args: + argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"] + source_code: C++ source code containing a ENTRY_FUNCTION() function + + Returns: + A python version of ENTRY_FUNCTION() + """ + parseargs = ", ".join( + f"parse_arg<{argtype.replace('const ', '')}>(args, {n})" + for n, argtype in enumerate(argtypes) + ) + suffix = cls.suffix_template % ( + cls.entry_function, + cls.extra_parse_arg % num_outputs if cls.extra_parse_arg else "", + cls.entry_function, + len(argtypes), + len(argtypes), + cls.call_entry_function % parseargs, + cls.entry_function, + cls.entry_function, + cls.entry_function, + cls.entry_function, + ) + result = cls.load(source_code + suffix, cuda) + assert isinstance(result, ModuleType) + return getattr(result, cls.entry_function) + + +class CppWrapperCodeCache(CppPythonBindingsCodeCache): + cache: Dict[str, Union[CDLL, ModuleType]] = {} + clear = staticmethod(cache.clear) + cpp_compile_command_flags = { + "include_pytorch": True, + "shared": True, + } + entry_function = "inductor_entry_cpp" + call_entry_function = "return THPVariable_WrapList(inductor_entry_cpp(%s));" + extra_parse_arg = textwrap.dedent( + """ + #include + #include + + template <> inline std::vector parse_arg>(PyObject* args, size_t n) { + return THPVariable_UnpackList(PyTuple_GET_ITEM(args, n)); + } + + std::vector inductor_entry_cpp(std::vector&& inputs) { + auto input_handles = unsafe_alloc_new_handles_from_tensors(inputs); + // For outputs, we only allocate a vector to hold returned tensor handles, + // not allocating the actual output tensor storage here + std::vector output_handles(%s); + + try { + inductor_entry_impl(input_handles.data(), output_handles.data()); + } 
class PyCodeCache:
    """In-process cache of Python modules executed from generated source files.

    Modules are looked up by the ``key`` returned from the module-level
    ``write`` helper.  Alongside each module an optional line map is kept so
    that a line of generated code can be traced back to a recorded stack
    trace (see ``stack_frames_for_code``).
    """

    # key -> executed module object
    cache: Dict[str, ModuleType] = dict()
    # source path -> ``list(zip(*linemap))``, i.e. ``[lines, nodes]``;
    # an empty list when the module was loaded without a line map
    linemaps: Dict[str, List[Tuple[Any, ...]]] = dict()
    # NOTE: this only empties ``cache``; ``linemaps`` is left untouched.
    clear = staticmethod(cache.clear)

    @classmethod
    def write(cls, source_code: str, extra: str = "") -> Tuple[str, str]:
        """Persist ``source_code`` as a ``.py`` file and return ``(key, path)``."""
        return write(source_code, "py", extra=extra)

    @classmethod
    def load(
        cls,
        source_code: str,
        extra: str = "",
        linemap: Optional[List[Tuple[int, str]]] = None,
        attrs: Optional[Dict[str, Any]] = None,
    ) -> ModuleType:
        """Write ``source_code`` to disk and load it via ``load_by_key_path``."""
        key, path = write(source_code, "py", extra=extra)
        return cls.load_by_key_path(key, path, linemap, attrs)

    @classmethod
    def load_by_key_path(
        cls,
        key: str,
        path: str,
        linemap: Optional[List[Tuple[int, str]]] = None,
        attrs: Optional[Dict[str, Any]] = None,
    ) -> ModuleType:
        """Return the module cached under ``key``, executing ``path`` on a miss.

        Args:
            key: Cache key; also used as the module-name suffix.
            path: Location of the Python source file to execute.
            linemap: ``(starting_line, stack_trace)`` pairs for the generated
                code; only recorded when the module is first loaded.
            attrs: Extra attributes set on the module after it has been
                executed; only applied when the module is first loaded.

        Raises:
            RuntimeError: if the file cannot be read or compiled.
        """
        if linemap is None:
            linemap = []
        if key not in cls.cache:
            with open(path) as f:
                try:
                    code = compile(f.read(), path, "exec")
                except Exception as e:
                    raise RuntimeError(
                        f"Failed to import {path}\n{type(e).__name__}: {e}"
                    ) from None
            # The file handle is released before the (possibly long) exec.
            mod = ModuleType(f"{__name__}.{key}")
            mod.__file__ = path
            mod.key = key  # type: ignore[attr-defined]
            exec(code, mod.__dict__, mod.__dict__)
            sys.modules[mod.__name__] = mod
            # another thread might set this first
            cls.cache.setdefault(key, mod)
            # unzip into separate lines/nodes lists
            cls.linemaps[path] = list(zip(*linemap))

            if attrs is not None:
                for k, v in attrs.items():
                    setattr(mod, k, v)

        return cls.cache[key]

    @classmethod
    @functools.lru_cache(None)
    def stack_frames_for_code(
        cls, path: str, lineno: int
    ) -> Optional[List[Dict[str, Any]]]:
        """Map a line of generated code back to the stack trace recorded for it.

        Returns a list of ``{"filename", "line", "name"}`` dicts, innermost
        frame first, or ``None`` when nothing is recorded for ``lineno``.

        NOTE: results (including ``None``) are memoised per ``(path, lineno)``,
        so a lookup made before the module is loaded keeps returning ``None``.
        """
        linemap = cls.linemaps.get(path)
        if not linemap:
            # Unknown path, or a module loaded with an empty line map:
            # ``list(zip(*[]))`` is ``[]`` and cannot be unpacked into two lists.
            return None
        # [(starting_line, <fx node>), ...]
        lines, nodes = linemap
        p = bisect_right(lines, lineno)
        if p == 0:
            return None
        entry = nodes[p - 1]
        if not entry:
            return None

        def parse_stack_trace(stack_trace: str) -> List[Dict[str, Any]]:
            # ideally fx stores stack traces as data rather than a string
            # but this is not along a performance critical path
            regex = r'File "(.+)", line (\d+), in (.+)\n'
            matches = re.findall(regex, stack_trace)
            return [
                {"filename": f, "line": int(l), "name": n}
                for f, l, n in reversed(matches)
            ]

        return parse_stack_trace(entry)
def _nvcc_host_compiler_options() -> List[str]:
    """Return the fixed set of host-compiler flags used for nvcc builds.

    A new list is built on every call, so callers may mutate the result.
    """
    host_flags = (
        "-fPIC",
        "-fno-strict-aliasing",
        "-fvisibility=hidden",
        "-Wconversion",
    )
    return list(host_flags)
class DLLWrapper:
    """A wrapper for a dynamic library.

    Symbols are reached through attribute access (``wrapper.some_func``).
    Each one is returned as a callable that treats a truthy return value of
    the underlying function as an error and raises ``RuntimeError``.

    The wrapper is also a context manager; leaving the ``with`` block (or
    garbage collection) closes the library.
    """

    def __init__(
        self,
        lib_path: str,
    ):
        self.lib_path = lib_path
        # Set before loading: if LoadLibrary raises, __del__ still runs on the
        # half-built instance and must find a defined ``is_open``.
        self.is_open = False
        self.DLL = cdll.LoadLibrary(lib_path)
        self.is_open = True

    def close(self):
        """Unload the library once; later calls are no-ops."""
        # Read the instance dict directly so that an instance which never
        # finished __init__ does not fall through to __getattr__.
        if self.__dict__.get("is_open", False):
            self._dlclose()
            self.is_open = False

    def _dlclose(self):
        """Call the C ``dlclose`` on the library handle.

        Raises:
            NotImplementedError: on any platform other than Linux.
        """
        f_dlclose = None

        if is_linux():
            syms = CDLL(None)
            if not hasattr(syms, "dlclose"):
                # Alpine Linux
                syms = CDLL("libc.so")

            if hasattr(syms, "dlclose"):
                f_dlclose = syms.dlclose
        else:
            raise NotImplementedError("Unsupported env, failed to do dlclose!")

        if f_dlclose is not None:
            f_dlclose.argtypes = [c_void_p]
            f_dlclose(self.DLL._handle)
        else:
            log.warning(
                "dll unloading function was not found, library may not be unloaded properly!"
            )

    def __getattr__(self, name):
        """Return a checked wrapper around the library symbol ``name``.

        Only invoked for attributes that are not found normally.

        Raises:
            RuntimeError: if the library is closed (or was never opened).
        """
        state = self.__dict__
        # Going through ``self.is_open`` here would re-enter __getattr__ and
        # recurse without bound whenever the attribute is missing.
        if not state.get("is_open", False):
            raise RuntimeError(
                f"Cannot use closed DLL library: {state.get('lib_path')}"
            )

        method = getattr(self.DLL, name)

        def _wrapped_func(*args):
            err = method(*args)
            if err:
                raise RuntimeError(f"Error in function: {method.__name__}")

        return _wrapped_func

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    def __del__(self):
        self.close()
+ Returns a tuple of dst_file_path, hash_key, source_code_path + """ + + key, input_path = cls.write(source_code, dst_file_ext) + if key not in cls.cache: + from filelock import FileLock + + lock_dir = get_lock_dir() + lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT) + with lock: + output_path = input_path[: -len(cls._SOURCE_CODE_SUFFIX)] + dst_file_ext + if not os.path.exists(output_path): + cmd = cuda_compile_command( + [input_path], output_path, dst_file_ext + ).split(" ") + try: + subprocess.check_output( + cmd, stderr=subprocess.STDOUT, env=os.environ + ) + except subprocess.CalledProcessError as error: + raise exc.CUDACompileError(cmd, error.output) from error + cls.cache[key] = CUDACodeCache.CacheEntry(input_path, output_path) + + return (cls.cache[key].output_path, key, input_path) + + @classmethod + def load(cls, source_code, dst_file_ext) -> Tuple[DLLWrapper, str, str]: + """ + Compiles source code and loads the generated .so file. + Returns a tuple of DLLWrapper, hash_key, source_code_path + """ + + if dst_file_ext != "so": + raise RuntimeError( + f"Only support loading a .so file for now. " + f"Requested file extension: {dst_file_ext}. 
def _set_triton_ptxas_path() -> None:
    """Export ``TRITON_PTXAS_PATH`` for a ``ptxas`` found next to this package.

    Nothing happens when the variable is already set or when ``../bin/ptxas``
    (relative to this file) does not exist.  If the path exists but is not an
    executable regular file, a warning is emitted and the variable stays unset.
    """
    if "TRITON_PTXAS_PATH" in os.environ:
        return

    package_dir = os.path.dirname(__file__)
    candidate = os.path.abspath(os.path.join(package_dir, "..", "bin", "ptxas"))

    if os.path.exists(candidate):
        runnable = os.path.isfile(candidate) and os.access(candidate, os.X_OK)
        if runnable:
            os.environ["TRITON_PTXAS_PATH"] = candidate
        else:
            warnings.warn(f"{candidate} exists but is not an executable")
def _async_compile_initializer(orig_ppid) -> None:
    """Per-worker setup: start a parent watchdog and ignore Ctrl-C.

    A daemon thread checks once a second whether this process's parent pid
    still equals ``orig_ppid``; when it no longer does, the process sends
    itself SIGKILL.  The thread is stored in the module-level
    ``_watchdog_thread``.

    This must stay a module-level function: it is passed as the pool
    initializer and has to be picklable for the "spawn" start method.
    """
    global _watchdog_thread

    def _die_if_orphaned() -> None:
        while True:
            sleep(1)
            if os.getppid() != orig_ppid:
                os.kill(os.getpid(), signal.SIGKILL)

    watchdog = Thread(target=_die_if_orphaned, daemon=True)
    _watchdog_thread = watchdog
    watchdog.start()
    # Ignore Ctrl-C (i.e. SIGINT) sent to pool workers to avoid meaningless log spam.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
class AsyncCompile:
    """Front end for compiling generated kernels, optionally in parallel.

    When ``config.compile_threads`` is greater than 1, work is submitted to a
    shared thread pool (``pool``) or process pool (``process_pool``) and
    futures are returned; otherwise every task runs synchronously in the
    caller.  ``wait`` replaces the futures stored in a scope dict with their
    results.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    @functools.lru_cache(1)
    def pool() -> ThreadPoolExecutor:
        """Return the thread pool, created on first use and then reused."""
        assert config.compile_threads > 1
        return ThreadPoolExecutor(config.compile_threads)

    @staticmethod
    @functools.lru_cache(1)
    def process_pool() -> ProcessPoolExecutor:
        """Return the worker process pool, created on first use and then reused.

        The pool is also added to the module-level ``_pool_set`` and gets a
        high-priority multiprocessing finalizer that shuts it down.
        """
        # ensure properties have been calculated before processes
        # are forked
        caching_device_properties()
        assert config.compile_threads > 1
        # pid of *this* process; the workers' watchdog compares it against
        # their own parent pid
        orig_ppid = os.getpid()

        ctx = multiprocessing.get_context(config.worker_start_method)
        pool = ProcessPoolExecutor(
            config.compile_threads,
            mp_context=ctx,
            initializer=partial(_async_compile_initializer, orig_ppid),
        )

        global _pool_set
        _pool_set.add(pool)

        # when this pool is created in a subprocess object, the normal exit handler
        # doesn't run, and we need to register our own handler.
        # exitpriority has to be high, because another one of the finalizers will
        # kill the worker thread that sends the shutdown message to the workers...
        multiprocessing.util.Finalize(None, pool.shutdown, exitpriority=sys.maxsize)
        return pool

    @classmethod
    def warm_pool(cls) -> None:
        """Start the worker processes eagerly; no-op with a single compile thread."""
        if config.compile_threads <= 1:
            return
        _compile_start()
        pool = cls.process_pool()

        # We have to fork processes for compiler workers, but the more memory and other resources that are loaded, the
        # slower the os.fork time is, quite drastically. It also holds the GIL so we can't put it on another thread.

        # Examples:
        # A simple x + x + x script: 10ms in the middle of the program, 2ms at startup
        # tf_efficientnet_b0 benchmark: 50ms! in the middle of the program, 3ms at startup

        # So we want to start the workers early when it is still cheap, and also to allow the workers to get
        # ready before we have work for them.

        # ProcessPoolExecutor also does not launch the workers until it finds a point when all the workers are idle.
        # But if we waited until then fork time will be long and we will be waiting for the processes to initialize.

        # We force them to start here with some YOLOing of the internal methods.
        # These are private executor methods; which ones exist is probed with
        # hasattr rather than assumed.
        if hasattr(pool, "_start_queue_management_thread"):
            pool._start_queue_management_thread()
        else:
            for _ in range(config.compile_threads):
                pool._adjust_process_count()
            if hasattr(pool, "_start_executor_manager_thread"):
                pool._start_executor_manager_thread()
        _compile_end()

    @classmethod
    def submit(cls, task: Callable[..., Any]) -> Any:
        """Run ``task`` on the thread pool and return its future.

        With a single compile thread the task runs immediately and its
        *result* (not a future) is returned.
        """
        if config.compile_threads <= 1:
            return task()
        return cls.pool().submit(task)

    @classmethod
    def map(cls, fn: Callable[..., Any], seq: List[Any]) -> List[Any]:
        """Apply ``fn`` to every item of ``seq`` and return the results in order.

        The thread pool is used only when there is more than one compile
        thread and more than one item.
        """
        if config.compile_threads <= 1 or len(seq) <= 1:
            return list(map(fn, seq))
        # Submit everything first, then collect, so the tasks can overlap.
        return [t.result() for t in [cls.pool().submit(fn, x) for x in seq]]

    def triton(
        self, kernel_name: str, source_code: str, device_str: str = "cuda"
    ) -> Union[TritonFuture, ModuleType]:
        """Compile a Triton kernel.

        In parallel mode the compile job goes to the process pool and a
        ``TritonFuture`` is returned; otherwise the kernel is loaded directly.
        """
        _compile_start()

        if config.compile_threads > 1:
            device_interface = get_interface_for_device(device_str)
            device = torch.device(device_str, device_interface.current_device())
            cc = device_interface.get_compute_capability(device)
            future = self.process_pool().submit(
                _worker_compile, kernel_name, source_code, cc, device
            )
            return TritonFuture(kernel_name, source_code, future)
        else:
            return _load_kernel(kernel_name, source_code)

    def multi_kernel(self, *args, **kwargs) -> ModuleType:
        """
        Async compile the python shim for multi-kernel.

        All arguments are forwarded to ``MultiKernelCall``.
        """

        def task():
            # imported lazily, inside the task
            from torch._inductor.codegen.multi_kernel import MultiKernelCall

            return MultiKernelCall(*args, **kwargs)

        return self.submit(task)

    def cpp(self, source_code: str) -> ModuleType:
        """Submit a C++ kernel build; the task yields the loaded library's ``kernel``."""

        def task():
            return CppCodeCache.load(source_code).kernel

        return self.submit(task)

    def cpp_pybinding(self, argtypes: List[str], source_code: str) -> ModuleType:
        """Submit a C++ build that is wrapped via ``load_pybinding``."""
        return self.submit(
            functools.partial(
                CppPythonBindingsCodeCache.load_pybinding, argtypes, source_code
            )
        )

    def cuda(self, source_code, dst_file_ext):
        """Submit a CUDA build; the task yields the first item returned by ``CUDACodeCache.load``."""

        def task():
            return CUDACodeCache.load(source_code, dst_file_ext)[0]

        return self.submit(task)

    def wait(self, scope: Dict[str, Any]) -> None:
        """Replace every pending future in ``scope`` with its result, in place.

        Futures are only resolved when ``config.compile_threads > 1``.
        """
        # Only used as the progress-bar total.
        num_kernels = len(
            [
                value
                for key, value in scope.items()
                if isinstance(value, (Future, TritonFuture))
            ]
        )
        pbar = tqdm(
            total=num_kernels,
            desc="Inductor Compilation",
            disable=config.disable_progress,
            delay=0,
        )
        if config.compile_threads > 1:
            for key, result in scope.items():
                if config.verbose_progress and not isinstance(pbar, _Faketqdm):
                    pbar.set_postfix_str(key)
                if isinstance(result, (Future, TritonFuture)):
                    # Existing keys are reassigned, so the dict size does not
                    # change while it is being iterated.
                    scope[key] = result.result()
                    pbar.update(1)

        _compile_end()
+ log.info("Do not call AsyncCompile.warm_pool() because TorchTNT is in use.") +else: + AsyncCompile.warm_pool() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py new file mode 100644 index 0000000000000000000000000000000000000000..acc544bd168b6ff6333e1b64d9d69fd4834921d6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/compile_fx.py @@ -0,0 +1,1451 @@ +import contextlib +import functools +import logging +import os +import sys +import time +import warnings +from itertools import count + +from typing import ( + Any, + Callable, + Dict, + FrozenSet, + List, + Optional, + Sequence, + Tuple, + Union, +) +from unittest import mock + +from functorch.compile import min_cut_rematerialization_partition + +import torch.fx +import torch.utils._pytree as pytree +from torch._dynamo import ( + compiled_autograd, + config as dynamo_config, + logging as dynamo_logging, + utils as dynamo_utils, +) +from torch._dynamo.utils import ( + counters, + detect_fake_mode, + lazy_format_graph_code, + optimus_scuba_log, +) +from torch._functorch.aot_autograd import aot_export_module, make_boxed_func +from torch._inductor.codecache import code_hash, CompiledFxGraph, FxGraphCache +from torch._inductor.cudagraph_utils import BoxedDeviceIndex + +from torch._inductor.debug import save_args_for_compile_fx_inner +from torch._inductor.utils import BoxedBool, count_tangents +from torch._logging import trace_structured +from torch._ops import OpOverload +from torch._subclasses.fake_tensor import FakeTensor +from torch._utils_internal import signpost_event +from torch.fx.passes.fake_tensor_prop import FakeTensorProp + +from .._dynamo.backends.common import aot_autograd +from ..fx._lazy_graph_module import _use_lazy_graph_module # type: ignore[attr-defined] +from ..fx.graph import _PyTreeCodeGen +from . 
def get_expanded_dims(t):
    """Return the indices of the expanded dimensions of tensor ``t``.

    A dimension counts as expanded when its stride is 0 and its size is not 1.
    For anything that is not a ``torch.Tensor`` the result is ``None``.
    """
    if isinstance(t, torch.Tensor):
        return [
            dim
            for dim, (stride, size) in enumerate(zip(t.stride(), t.size()))
            if stride == 0 and size != 1
        ]
    return None
def _unlift_graph(mod, gm, graph_signature):
    """Turn a graph with lifted parameters/buffers back into a stateful module.

    Args:
        mod: Module whose parameters and buffers are attached to ``gm``.
        gm: GraphModule whose placeholders include the lifted state.
        graph_signature: Maps placeholder names to parameter/buffer names
            (``inputs_to_parameters``, ``inputs_to_buffers``), lists
            ``user_inputs``, and maps output names to mutated buffers
            (``buffers_to_mutate``).

    Returns:
        The GraphModule produced by ``torch.export._unlift._unlift``.
    """
    from torch.export.unflatten import _assign_attr, _AttrKind

    # Attach every parameter and buffer (duplicates included) to gm under
    # its original name, and keep them for _unlift.
    state_dict = {}
    for name, param in mod.named_parameters(remove_duplicate=False):
        state_dict[name] = param
        _assign_attr(
            param,
            gm,
            name,
            attr_kind=_AttrKind.PARAMETER,
        )
    for name, buffer in mod.named_buffers(remove_duplicate=False):
        state_dict[name] = buffer
        _assign_attr(
            buffer,
            gm,
            name,
            attr_kind=_AttrKind.BUFFER,
        )

    # One entry per placeholder: the lifted attribute name, or None for a
    # genuine user input.
    lifted_inputs = []
    for node in gm.graph.nodes:
        if node.op != "placeholder":
            continue
        node_name = node.name
        if node_name in graph_signature.inputs_to_parameters:
            lifted_inputs.append(graph_signature.inputs_to_parameters[node_name])
        elif node_name in graph_signature.inputs_to_buffers:
            lifted_inputs.append(graph_signature.inputs_to_buffers[node_name])
        else:
            assert node_name in graph_signature.user_inputs
            lifted_inputs.append(None)

    from torch.export._unlift import _unlift

    outputs = list(gm.graph.nodes)[-1].args[0]
    # buffers_to_mutate is indexed by output *name*. The membership test used
    # to be done with the node object itself, which never matches a name key,
    # so no buffer mutation was ever recorded. Outputs without a ``name``
    # (anything that is not a node) map to None.
    buffers_to_mutate = graph_signature.buffers_to_mutate
    mutated_outputs = []
    for out in outputs:
        out_name = getattr(out, "name", None)
        if out_name in buffers_to_mutate:
            mutated_outputs.append(buffers_to_mutate[out_name])
        else:
            mutated_outputs.append(None)

    unlifted_gm = _unlift(
        gm,
        lifted_inputs,
        mutated_outputs,
        pytree.LeafSpec(),
        None,
        state_dict,
        {},
    )
    return unlifted_gm
+ Returns (const_gm, const_output_index) + """ + from torch._inductor.constant_folding import ( + CONST_MODULE_TAG, + META_TAG, + MODULE_TAG, + replace_node_with_constant, + run_and_get_constant_graph, + ) + + const_gm = run_and_get_constant_graph(gm) + const_result = const_gm() + + const_outputs = { + x.name: idx for idx, x in enumerate(tuple(const_gm.graph.nodes)[-1].args[0]) + } + + to_erase_node = [] + to_replace_node = [] + const_output_index = {} + for node in gm.graph.nodes: + if node.name in const_outputs: + to_replace_node.append(node) + elif node.meta[META_TAG] == CONST_MODULE_TAG: + to_erase_node.append(node) + + for node in to_replace_node: + new_const_name = "_FOLDED_CONST_" + node.name + replace_node_with_constant( + gm, + node, + const_result[const_outputs[node.name]], + new_const_name, + ) + const_output_index[new_const_name] = const_outputs[node.name] + for node in to_erase_node[::-1]: + if node.users: + for n in node.users: + assert n.meta[META_TAG] == MODULE_TAG, f"node: {node} user not empty." 
def is_tf32_warning_applicable(gm: torch.fx.GraphModule):
    """Return True if ``gm`` contains a float32 matmul-family op on a CUDA value.

    A node qualifies when it is a ``call_function`` to one of
    ``aten.mm/addmm/bmm/baddbmm`` (``.default`` overloads) and its
    ``meta["val"]`` is a ``torch.Tensor`` with dtype float32 and device type
    ``"cuda"``.
    """
    aten = torch.ops.aten
    matmul_overloads = {
        aten.mm.default,
        aten.addmm.default,
        aten.bmm.default,
        aten.baddbmm.default,
    }

    def _is_fp32_cuda_matmul(node) -> bool:
        if node.op != "call_function":
            return False
        if node.target not in matmul_overloads:
            return False
        val = node.meta.get("val", None)
        if not isinstance(val, torch.Tensor):
            return False
        return val.dtype == torch.float32 and val.device.type == "cuda"

    return any(_is_fp32_cuda_matmul(node) for node in gm.graph.nodes)
+ """ + fake_mode = detect_fake_mode(example_inputs) + if not fake_mode: + fake_mode = torch._subclasses.FakeTensorMode(allow_non_fake_inputs=True) + FakeTensorProp(gm, mode=fake_mode).propagate(*example_inputs) + else: + ctx = ( + contextlib.nullcontext() + if not force_allow_non_fake_inputs + else mock.patch.object(fake_mode, "allow_non_fake_inputs", True) + ) + with ctx: # type: ignore[attr-defined] + FakeTensorProp(gm, mode=fake_mode).propagate_dont_convert_inputs( + *example_inputs + ) + + return fake_mode + + +# pass config dict back to user +def get_patched_config_dict(config_patches=None) -> Dict[str, Any]: + with config.patch(config_patches): + return config.get_config_copy() + + +@DebugContext.wrap +@torch.utils._python_dispatch._disable_current_modes() +@time_and_log( + attr="compilation time (in seconds)", + extra_loggings={"config_dict": str(get_patched_config_dict())}, +) +# Need this decorator for compile_fx_inner even if we already have one for +# compile_fx. The reason is the compilation for backward graph may happen after +# compile_fx return and we may want to use the _LazyGraphModule for compiling +# the backward graph as well. +@_use_lazy_graph_module(dynamo_config.use_lazy_graph_module) +@dynamo_utils.dynamo_timed(phase_name="inductor_compile") +def compile_fx_inner( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + cudagraphs: Optional[BoxedBool] = None, + num_fixed: int = 0, + is_backward: bool = False, + graph_id: Optional[int] = None, + cpp_wrapper: bool = False, + aot_mode: bool = False, + is_inference: bool = False, + boxed_forward_device_index: Optional[BoxedDeviceIndex] = None, + user_visible_outputs: FrozenSet[str] = frozenset(), + layout_opt: Optional[bool] = None, + extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None, +) -> Union[CompiledFxGraph, str]: + """ + Inductor API that compiles a single graph. 
+ + If you change the argument list for this function, make sure you + also update the call to save_args_for_compile_fx_inner below accordingly. + """ + if dynamo_utils.count_calls(gm.graph) == 0 and not aot_mode: + # trigger the real recompilation for _LazyGraphModule before returning + # the forward method. + from torch.fx._lazy_graph_module import _LazyGraphModule + + _LazyGraphModule.force_recompile(gm) + return make_boxed_func(gm.forward) + + assert isinstance( + next(iter(reversed(gm.graph.nodes))).args[0], (tuple, list) + ), f"inductor can only compile FX graphs which return a tuple/list, but got {gm.graph}" + + if config.save_args: + save_args_for_compile_fx_inner( + gm, + example_inputs, + cudagraphs=cudagraphs, + num_fixed=num_fixed, + is_backward=is_backward, + graph_id=graph_id, + cpp_wrapper=cpp_wrapper, + aot_mode=aot_mode, + is_inference=is_inference, + boxed_forward_device_index=boxed_forward_device_index, + user_visible_outputs=user_visible_outputs, + layout_opt=layout_opt, + ) + + if cudagraphs is None: + cudagraphs = BoxedBool(config.triton.cudagraphs) + + # Inputs to fx_codegen_and_compile + # Anything that affects codegen should go here, so if the signature + # of fx_codegen_and_compile changes, the dict should be updated accordingly + graph_kwargs = { + "cudagraphs": cudagraphs, + "num_fixed": num_fixed, + "is_backward": is_backward, + "graph_id": graph_id, + "cpp_wrapper": cpp_wrapper, + "aot_mode": aot_mode, + "is_inference": is_inference, + "user_visible_outputs": user_visible_outputs, + "layout_opt": layout_opt, + "extern_node_serializer": extern_node_serializer, + } + + start = time.time() + + if config.fx_graph_cache and not aot_mode: + compiled_graph = FxGraphCache.load( + fx_codegen_and_compile, gm, example_inputs, graph_kwargs + ) + else: + compiled_graph = fx_codegen_and_compile( + gm, example_inputs, **graph_kwargs # type: ignore[arg-type] + ) + + log.debug("FX codegen and compilation took %.3fs", time.time() - start) + + # check 
cudagraph disabling reasons from inductor lowering + if cudagraphs and compiled_graph.disabled_cudagraphs_reason: + perf_hint_log.warning( + "skipping cudagraphs due to %s", compiled_graph.disabled_cudagraphs_reason + ) + BoxedBool.disable(cudagraphs) + + # Return the output strides to the caller via TracingContext + context = torch._guards.TracingContext.try_get() + if context is not None and context.output_strides is not None: + assert len(context.output_strides) == 0 + context.output_strides.extend(compiled_graph.output_strides) + + if aot_mode: + return compiled_graph + + if cudagraphs: + # output args are tuple of first argument + output = output_node(gm) + assert len(output.args) == 1 + stack_traces = [ + (arg.stack_trace if isinstance(arg, torch.fx.node.Node) else None) + for arg in output.args[0] + ] + + complex_memory_overlap_inputs = any( + complex_memory_overlap(t) + for t in example_inputs + if isinstance(t, torch.Tensor) + ) + + from torch._inductor.cudagraph_utils import check_for_mutation + + has_mutation_str = check_for_mutation(gm, compiled_graph, num_fixed) + has_mutation = has_mutation_str is not None + + if has_mutation: + compiled_graph.disabled_cudagraphs_reason = has_mutation_str + + cudagraph_tests = [ + (not has_mutation, "mutated inputs"), + (not has_incompatible_cudagraph_ops(gm), "incompatible ops"), + (not complex_memory_overlap_inputs, "complex memory overlap"), + ( + all( + isinstance(t, (torch.Tensor, torch.SymInt)) for t in example_inputs + ), + "non-Tensor inputs", + ), + ] + cudagraph_fail_reasons = [s for b, s in cudagraph_tests if not b] + + if not cudagraph_fail_reasons: + if not config.triton.cudagraph_trees: + # Force specialize all inputs so that CUDA graphs will work + for t in example_inputs: + if isinstance(t, torch.SymInt): + int(t) # guard + + if ( + boxed_forward_device_index is not None + and not is_inference + and not is_backward + ): + boxed_forward_device_index.set(next(iter(compiled_graph.device_idxs))) + + 
compiled_graph.current_callable = cudagraphify( + compiled_graph.get_current_callable(), + example_inputs, + static_input_idxs=range(num_fixed), + device_index=next(iter(compiled_graph.device_idxs)), + stack_traces=stack_traces, + is_backward=is_backward, + is_inference=is_inference, + constants=tuple(compiled_graph.constants.values()), + ) + else: + BoxedBool.disable(cudagraphs) + + # See [Backward Generation Handling] + # if cudagraph'd the forward and set the device, we need to let the cudagraph manager + # know we are we running the backward even if we will not run it in cudagraphs + if is_backward and config.triton.cudagraph_trees: + assert boxed_forward_device_index is not None + assert boxed_forward_device_index.value is not None + compiled_graph_callable = compiled_graph.get_current_callable() + + manager = torch._inductor.cudagraph_trees.get_manager( + boxed_forward_device_index.value, create_if_none_exists=False + ) + # should already exist from forward + assert manager is not None + + def compiled_artifact(new_inputs): + manager.set_to_running_backward() + return compiled_graph_callable(new_inputs) + + compiled_graph.current_callable = compiled_artifact + + if "cuda" in compiled_graph.device_types: + # prefer better disable_cudagraphs_reason bc stack trace + # TODO: migrate all disable reasons to stack trace, refactor + if compiled_graph.disabled_cudagraphs_reason: + perf_hint_log.warning(compiled_graph.disabled_cudagraphs_reason) + else: + perf_hint_log.warning( + "skipping cudagraphs due to %s", cudagraph_fail_reasons + ) + + # cudagraphs does its own aligning of inputs + if not cudagraphs: + new_callable = align_inputs( + compiled_graph.get_current_callable(), example_inputs, range(num_fixed) + ) + if new_callable is not compiled_graph.get_current_callable(): + compiled_graph.current_callable = new_callable + + _step_logger()( + logging.INFO, + "torchinductor done compiling " + f"{'BACKWARDS' if is_backward else 'FORWARDS'} " + f"graph {graph_id}", + 
) + + # aot autograd needs to know to pass in inputs as a list + compiled_graph._boxed_call = True + return compiled_graph + + +def fx_codegen_and_compile( + gm: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + cudagraphs: Optional[BoxedBool] = None, + num_fixed: int = 0, + is_backward: bool = False, + graph_id: Optional[int] = None, + cpp_wrapper: bool = False, + aot_mode: bool = False, + is_inference: bool = False, + user_visible_outputs: FrozenSet[str] = frozenset(), + layout_opt: Optional[bool] = None, + extern_node_serializer: Optional[Callable[[List[ExternKernelNode]], Any]] = None, +) -> Union[CompiledFxGraph, str]: + if is_tf32_warning_applicable(gm): + _warn_tf32_disabled() + + # lift the maximum depth of the Python interpreter stack + # to adapt large/deep models + sys.setrecursionlimit(max(sys.getrecursionlimit(), 2000)) + + _step_logger()( + logging.INFO, + "torchinductor compiling " + f"{'BACKWARDS' if is_backward else 'FORWARDS'} " + f"graph {graph_id}", + ) + V.debug.fx_graph(gm, example_inputs) + # TODO: Should we actually dump this? It should be redundant with the aot + # structured logs... + # trace_structured("inductor_input_graph", payload_fn=lambda: gm.print_readable(print_output=False)) + + shape_env = _shape_env_from_inputs(example_inputs) + + # Convert view to reshape in the graph. This is necessary primarily for + # layout optimization. Do it unconditionally for uniformity. + # + # It's needed because when we do layout optimization, an contiguous tensor + # in eager mode may becomes a channels last tensor. A view op previously + # can be applied to the contiguous tensor may not be able to be applied + # on the channels tensor any more. An error like + # RuntimeError: view size is not compatible with input tensor's size and stride + # (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead. + # will be printed. + # + # Replace view op to reshape op in this case. 
+ # As an example, timm_resnest/botnet26t_256/convnext_base etc. will fail if we don't do this. + # + # Also this has to be done before FakeTensorProp below to avoid the failed + # .view() call. + view_to_reshape(gm) + + # It is safe to run FakeTensorProp under no_grad because by the time + # we're in inductor, we assume that AOTAutograd has already "taken care" + # of autograd, so there should be no more autograd-related API's in the + # graph. + with torch.no_grad(): + fake_mode = fake_tensor_prop(gm, example_inputs) + + # pattern matcher passes might not preserve striding information + # on node.meta["val"]. if in the future we rely on these being + # correct we will need to fix. + + with V.set_fake_mode(fake_mode): + # has some issues with memory in training + _recursive_post_grad_passes(gm, is_inference=is_inference) + V.debug.fx_graph_transformed(gm, example_inputs) + post_grad_graphs_log.debug("%s", lazy_format_graph_code("AFTER POST GRAD", gm)) + trace_structured( + "inductor_post_grad_graph", + payload_fn=lambda: gm.print_readable(print_output=False), + ) + optimus_scuba_log["inductor_post_grad"] = counters["inductor"] + signpost_event( + "optimus", + "compile_fx.post_grad_passes", + optimus_scuba_log, + ) + + with V.set_fake_mode(fake_mode): + const_output_index = None + const_graph = None + const_code = None + + if aot_mode and config.aot_inductor.use_runtime_constant_folding: + const_gm, const_output_index = split_const_gm(gm) + + const_graph = GraphLowering( + const_gm, + example_inputs=[], + shape_env=shape_env, + num_static_inputs=num_fixed, + graph_id=graph_id, + cpp_wrapper=cpp_wrapper, + aot_mode=aot_mode, + user_visible_outputs=user_visible_outputs, + extern_node_serializer=extern_node_serializer, + is_inference=is_inference, + is_const_graph=True, + ) + with V.set_graph_handler(const_graph): + assert cpp_wrapper, "AOT mode only supports C++ wrapper" + const_graph.run() + + const_code, _ = const_graph.codegen_with_cpp_wrapper() + + graph = 
def clone_preserve_strides(x: torch.Tensor):
    """Return a copy of ``x`` that keeps ``x``'s sizes and strides.

    The elements reachable through ``x``'s size/stride layout are viewed as a
    flat 1-D range, cloned into a fresh buffer, and the original size/stride
    view is then re-created on top of that buffer.

    Args:
        x: tensor to copy.

    Returns:
        A tensor backed by new memory with ``x.size()`` and ``x.stride()``.
    """
    if 0 in x.size():
        # No elements at all: the span formula below would use (0 - 1) * stride
        # and could yield a negative or out-of-bounds length, so short-circuit.
        needed_size = 0
    else:
        # Largest reachable element offset (relative to the view start) + 1.
        needed_size = (
            sum((shape - 1) * stride for shape, stride in zip(x.size(), x.stride()))
            + 1
        )
    buffer = torch.as_strided(x, (needed_size,), (1,)).clone()
    return torch.as_strided(buffer, x.size(), x.stride())
def get_input_idxs_to_check(
    inputs: Union[List[torch.Tensor], Sequence[int]],
    static_input_idxs: Sequence[int],
) -> Sequence[int]:
    """Return the positions in ``inputs`` that are selected for an alignment check.

    A position is selected when the entry is a ``torch.Tensor`` on a ``"cuda"``
    device and either its position is not in ``static_input_idxs`` or its
    storage offset, converted to bytes with ``get_dtype_size``, is not a
    multiple of ``ALIGNMENT``.

    Args:
        inputs: sequence of tensors and/or non-tensor values.
        static_input_idxs: positions treated as static inputs.

    Returns:
        A list of the selected positions, in increasing order.
    """

    def _must_check(position, candidate):
        # Non-tensor entries are never selected.
        if not isinstance(candidate, torch.Tensor):
            return False
        # A static input is skipped only when its byte offset is aligned.
        if position in static_input_idxs:
            byte_offset = candidate.storage_offset() * get_dtype_size(candidate.dtype)
            if byte_offset % ALIGNMENT == 0:
                return False
        # The device is inspected last, as in the combined condition this replaces.
        return candidate.device.type == "cuda"

    return [
        position
        for position, candidate in enumerate(inputs)
        if _must_check(position, candidate)
    ]
def remove_unaligned_input_idxs(
    inputs: Union[List[torch.Tensor], Sequence[int]],
    static_input_idxs: Sequence[int],
):
    """
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.

    Concretely, this filters ``static_input_idxs`` down to the indices whose
    entry in ``inputs`` is a ``torch.Tensor`` with ``data_ptr()`` that is a
    multiple of ``ALIGNMENT``; indices dropped here are no longer treated as
    static by the caller.

    Args:
        inputs: all inputs, indexable by the values in ``static_input_idxs``.
        static_input_idxs: indices into ``inputs`` currently treated as static.

    Returns:
        ``static_input_idxs`` itself when every index was kept, otherwise a
        new list holding only the kept indices (in their original order).
    """
    aligned_static_input_idxs = []
    # Look each static index up in ``inputs``. Zipping the two sequences would
    # pair the k-th static index with the k-th input, which is only correct
    # when static_input_idxs happens to be 0, 1, 2, ...
    for idx in static_input_idxs:
        candidate = inputs[idx]
        if (
            isinstance(candidate, torch.Tensor)
            and (candidate.data_ptr() % ALIGNMENT) == 0
        ):
            aligned_static_input_idxs.append(idx)
    if len(aligned_static_input_idxs) != len(static_input_idxs):
        return aligned_static_input_idxs
    return static_input_idxs
inps_expanded_dims)): + if isinstance(x, torch.Tensor) and idx not in static_input_idxs: + index_expanded_dims_and_copy_(static_inputs[idx], x, expanded_dims) + + # warmup + torch.cuda.synchronize() + stream = torch.cuda.Stream() + stream.wait_stream(torch.cuda.current_stream()) + # copy static_inputs because it will be cleared in model + with torch.cuda.stream(stream): + model(list(static_inputs)) + stream.synchronize() + torch.cuda.current_stream().wait_stream(stream) + torch.cuda.synchronize() + + # record + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream, capture_error_mode="thread_local"): + static_outputs = model(list(static_inputs)) + if not isinstance(static_outputs, (list, tuple)): + static_outputs = (static_outputs,) + + if config.size_asserts: + + def run(new_inputs): + assert len(static_inputs) == len(new_inputs) + for idx, (dst, src, expanded_dims) in enumerate( + zip(static_inputs, new_inputs, inps_expanded_dims) + ): + if not isinstance(dst, torch.Tensor): + pass + elif idx in static_input_idxs: + assert dst.data_ptr() == src.data_ptr() + else: + # TODO - could make one single op of multiple slices + # and avoid dispatch. 
def compile_fx_aot(
    model_: torch.fx.GraphModule,
    example_inputs_: List[torch.Tensor],
    inner_compile: Callable[..., Any] = compile_fx_inner,
    config_patches: Optional[Dict[str, Any]] = None,
):
    """Run ``compile_fx`` in AOT-compilation mode and return its result.

    The config patches passed on always have ``"cpp_wrapper"`` set to ``True``.
    When neither the patches nor ``config.aot_inductor.output_path`` provide an
    output path, ``code_hash(model_.code)`` is used for
    ``"aot_inductor.output_path"``. An ``"extern_node_serializer"`` entry, if
    present, is removed from the patches and handed to ``inner_compile`` as a
    keyword argument instead.

    Args:
        model_: graph module to compile.
        example_inputs_: example inputs forwarded to ``compile_fx``.
        inner_compile: inner compile callable; it is wrapped so that it is
            called with ``aot_mode=True`` and ``extern_node_serializer``.
        config_patches: optional config overrides; the caller's dict is not
            modified.

    Returns:
        The value returned by ``compile_fx``, asserted to be an existing path.
    """
    # Build a private dict so the caller's mapping is left untouched.
    patches: Dict[str, Any] = {}
    if config_patches is not None:
        patches.update(config_patches)
    patches["cpp_wrapper"] = True

    if "aot_inductor.output_path" not in patches:
        if not config.aot_inductor.output_path:
            patches["aot_inductor.output_path"] = code_hash(model_.code)

    serializer = patches.pop("extern_node_serializer", None)

    with V.set_aot_compilation(True):
        aot_inner_compile = functools.partial(
            inner_compile,
            aot_mode=True,
            extern_node_serializer=serializer,
        )
        compiled_lib_path = compile_fx(
            model_,
            example_inputs_,
            inner_compile=aot_inner_compile,
            config_patches=patches,
        )
        assert os.path.exists(
            compiled_lib_path
        ), f"AOTInductor compiled library does not exist at {compiled_lib_path}"
        return compiled_lib_path
convert_conv_weights_to_channels_last, freeze + + # partition_fn won't be called + _recursive_joint_graph_passes(aot_autograd_model) + + layout_opt = GraphLowering.decide_layout_opt(aot_autograd_model, is_inference=True) + if layout_opt: + # make sure meta['val'] is properly setup + fake_tensor_prop(aot_autograd_model, aot_example_inputs, True) + convert_conv_weights_to_channels_last(aot_autograd_model) + + opt_model, preserved_arg_indices = freeze( + dynamo_model, + aot_autograd_model, + aot_example_inputs, # type: ignore[arg-type] + ) + + aot_example_inputs = [aot_example_inputs[ind] for ind in preserved_arg_indices] + num_fixed = len(preserved_arg_indices) - num_example_inputs + + fake_mode = detect_fake_mode(aot_example_inputs) + + # for freezing, all graph outputs should be user visible + *_, model_outputs_node = opt_model.graph.nodes + model_outputs = model_outputs_node.args[0] + user_visible_outputs = [ + n.name for n in model_outputs if isinstance(n, torch.fx.Node) + ] + + # constant params will be real tensors, not fake + tracing_context = torch._guards.TracingContext.try_get() + if tracing_context is not None: + params_flat = tracing_context.params_flat + assert params_flat is not None + for i in range(len(params_flat)): + if i not in preserved_arg_indices: + params_flat[i] = None + + with mock.patch.object(fake_mode, "allow_non_fake_inputs", True): + optimized_function = inner_compile( + opt_model, + aot_example_inputs, + num_fixed=num_fixed, + cudagraphs=cudagraphs, + graph_id=graph_id, + is_inference=True, + boxed_forward_device_index=forward_device, + layout_opt=layout_opt, + user_visible_outputs=user_visible_outputs, + ) + + # aot_inductor codegens a call that takes in just the inputs, so we don't return a wrapper + # that drops constant-ified params + if V.aot_compilation is True: + return optimized_function + + def wrapper(args): + args_new = [args[i] for i in preserved_arg_indices] + args.clear() + return optimized_function(args_new) + + 
wrapper._boxed_call = True # type: ignore[attr-defined] + + return wrapper + + +@_use_lazy_graph_module(dynamo_config.use_lazy_graph_module) +def compile_fx( + model_: torch.fx.GraphModule, + example_inputs_: List[torch.Tensor], + inner_compile: Callable[..., Any] = compile_fx_inner, + config_patches: Optional[Dict[str, Any]] = None, + decompositions: Optional[Dict[OpOverload, Callable[..., Any]]] = None, +): + """Main entrypoint to a compile given FX graph""" + if config_patches: + with config.patch(config_patches): + return compile_fx( + model_, + example_inputs_, + # need extra layer of patching as backwards is compiled out of scope + inner_compile=config.patch(config_patches)(inner_compile), + decompositions=decompositions, + ) + + if config.cpp_wrapper: + with config.patch( + { + "cpp_wrapper": False, + "triton.autotune_cublasLt": False, + "triton.cudagraphs": False, + "triton.store_cubin": True, + } + ), V.set_real_inputs(example_inputs_): + inputs_ = example_inputs_ + if isinstance(model_, torch.fx.GraphModule): + fake_inputs = [ + node.meta.get("val") + for node in model_.graph.nodes + if node.op == "placeholder" + ] + if all(v is not None for v in fake_inputs): + # Validate devices before switching to fake tensors. + for idx, fi, i in zip(count(), fake_inputs, inputs_): + if fi.device != i.device: + raise ValueError( + f"Device mismatch between fake input and example input at position #{idx}: " + f"{fi.device} vs {i.device}. If the model was exported via torch.export(), " + "make sure torch.export() and torch.aot_compile() run on the same device." 
+ ) + inputs_ = fake_inputs + return compile_fx( + model_, + inputs_, + inner_compile=functools.partial(inner_compile, cpp_wrapper=True), + decompositions=decompositions, + ) + + recursive_compile_fx = functools.partial( + compile_fx, + inner_compile=inner_compile, + decompositions=decompositions, + ) + + if not graph_returns_tuple(model_): + return make_graph_return_tuple( + model_, + example_inputs_, + recursive_compile_fx, + ) + + if isinstance(model_, torch.fx.GraphModule): + if isinstance(model_.graph._codegen, _PyTreeCodeGen): + # this graph is the result of dynamo.export() + return handle_dynamo_export_graph( + model_, + example_inputs_, + recursive_compile_fx, + ) + + model_ = _recursive_pre_grad_passes(model_, example_inputs_) + optimus_scuba_log["inductor_pre_grad"] = counters["inductor"] + signpost_event( + "optimus", + "compile_fx.pre_grad_passes", + optimus_scuba_log, + ) + + if any(isinstance(x, (list, tuple, dict)) for x in example_inputs_): + return flatten_graph_inputs( + model_, + example_inputs_, + recursive_compile_fx, + ) + + assert not config._raise_error_for_testing + num_example_inputs = len(example_inputs_) + cudagraphs = BoxedBool(config.triton.cudagraphs) + forward_device = BoxedDeviceIndex(None) + + graph_id = next(_graph_counter) + + decompositions = ( + decompositions if decompositions is not None else select_decomp_table() + ) + + @dynamo_utils.dynamo_timed + def fw_compiler_base( + model: torch.fx.GraphModule, + example_inputs: List[torch.Tensor], + is_inference: bool, + ): + if is_inference: + # partition_fn won't be called + _recursive_joint_graph_passes(model) + + fixed = torch._inductor.utils.num_fw_fixed_arguments( + num_example_inputs, len(example_inputs) + ) + user_visible_outputs = set() + + if config.keep_output_stride: + *_, model_outputs_node = model.graph.nodes + assert model_outputs_node.op == "output" + model_outputs = pytree.arg_tree_leaves(*model_outputs_node.args) + num_model_outputs = len(model_outputs) + + context 
= torch._guards.TracingContext.try_get() + # See Note [User Outputs in the inductor graph] + if context is not None and context.fw_metadata and not is_inference: + original_output_start_index = ( + context.fw_metadata.num_mutated_inp_runtime_indices + ) + else: + original_output_start_index = 0 + + if isinstance(model_, torch.fx.GraphModule): + *_, orig_model_outputs_node = model_.graph.nodes + assert orig_model_outputs_node.op == "output" + orig_model_outputs, _ = pytree.tree_flatten( + orig_model_outputs_node.args + ) + num_orig_model_outputs = len(orig_model_outputs) + else: + num_orig_model_outputs = num_model_outputs + + assert num_orig_model_outputs <= num_model_outputs + + # Note [User Outputs in the inductor graph] + # We makes the following assumption + # For inference + # len(orig_model_outputs) == len(model_outputs) + # For training + # len(orig_model_outputs) <= len(model_outputs) + # During training, most of the time the model_outputs starts with + # original module's outputs followed by saved activations. + # But this can be not true if the model have inplace updated tensors. + # AOTAutograd will make those tensors being returned before the original + # module's output. + # To make things safe, we'll use original_output_start_index field + # set by AOTAutograd to decide where the original module outputs start. + orig_output_end_idx = original_output_start_index + num_orig_model_outputs + # Sanity chec: we are about to splice out the "user" outputs from the full set + # of "graph" outputs. Make sure we're within bounds. 
+ assert orig_output_end_idx <= num_model_outputs + + user_visible_outputs = { + n.name + for n in model_outputs[original_output_start_index:orig_output_end_idx] + if isinstance(n, torch.fx.Node) + } + + return inner_compile( + model, + example_inputs, + num_fixed=fixed, + cudagraphs=cudagraphs, + graph_id=graph_id, + is_inference=is_inference, + boxed_forward_device_index=forward_device, + user_visible_outputs=user_visible_outputs, + ) + + fw_compiler = functools.partial(fw_compiler_base, is_inference=False) + + if config.freezing and not torch.is_grad_enabled(): + inference_compiler = functools.partial( + fw_compiler_freezing, + dynamo_model=model_, + num_example_inputs=num_example_inputs, + inner_compile=inner_compile, + cudagraphs=cudagraphs, + graph_id=graph_id, + forward_device=forward_device, + ) + else: + inference_compiler = functools.partial(fw_compiler_base, is_inference=True) + + def partition_fn(graph, joint_inputs, **kwargs): + _recursive_joint_graph_passes(graph) + return min_cut_rematerialization_partition( + graph, joint_inputs, **kwargs, compiler="inductor" + ) + + @dynamo_utils.dynamo_timed + @dynamo_utils.maybe_cprofile + def bw_compiler(model: torch.fx.GraphModule, example_inputs: List[torch.Tensor]): + fixed = count_tangents(model) + return inner_compile( + model, + example_inputs, + num_fixed=fixed, + cudagraphs=cudagraphs, + is_backward=True, + graph_id=graph_id, + boxed_forward_device_index=forward_device, + ) + + # TODO: can add logging before/after the call to create_aot_dispatcher_function + # in torch._functorch/aot_autograd.py::aot_module_simplified::aot_function_simplified::new_func + # once torchdynamo is merged into pytorch + + fake_mode = detect_fake_mode(example_inputs_) or torch._subclasses.FakeTensorMode( + allow_non_fake_inputs=True + ) + tracing_context = ( + torch._guards.TracingContext.try_get() + or torch._guards.TracingContext(fake_mode) + ) + + if V.aot_compilation is True: + gm, graph_signature = aot_export_module( + 
model_, example_inputs_, trace_joint=False, decompositions=decompositions + ) + unlifted_gm = _unlift_graph(model_, gm, graph_signature) + if "dynamo_flat_name_to_original_fqn" in model_.meta: + unlifted_gm.meta["dynamo_flat_name_to_original_fqn"] = model_.meta[ + "dynamo_flat_name_to_original_fqn" + ] + with V.set_fake_mode(fake_mode), compiled_autograd.disable(): + return inference_compiler(unlifted_gm, example_inputs_) + + with V.set_fake_mode(fake_mode), torch._guards.tracing( + tracing_context + ), compiled_autograd.disable(): + return aot_autograd( + fw_compiler=fw_compiler, + bw_compiler=bw_compiler, + inference_compiler=inference_compiler, + decompositions=decompositions, + partition_fn=partition_fn, + keep_inference_input_mutations=True, + )(model_, example_inputs_) + + +def _shape_env_from_inputs(inputs: List[torch.Tensor]): + shape_env = None + fake_mode = detect_fake_mode(inputs) + + # TODO(voz): It would be nice to enable this assert, but there are lots of tests that + # pass in real inputs for now. + # if len(inputs) > 0: + # assert fake_mode is not None, breakpoint() + + if fake_mode is not None: + return fake_mode.shape_env + + # When there are no tensor inputs, get shape_env from the first SymInt. + for input in inputs: + if isinstance(input, torch.SymInt): + return input.node.shape_env + + # TODO(voz): Should we always have one anyway? 
def graph_returns_tuple(gm: torch.fx.GraphModule):
    """True if a FX graph returns a tuple

    Anything that is not a ``torch.fx.GraphModule`` cannot be inspected and is
    assumed to return a tuple. For a graph module, the single argument of its
    output node must be a list/tuple, or an FX node whose target exposes a
    ``_schema`` with more than one return where every return type prints as
    ``"Tensor"``.
    """
    if not isinstance(gm, torch.fx.GraphModule):
        return True  # can't check this, assume true

    (returned,) = output_node(gm).args
    if isinstance(returned, (list, tuple)):
        return True

    # for graphs whose result is one node with multiple outputs
    if not isinstance(returned, torch.fx.node.Node):
        return False
    if not hasattr(returned.target, "_schema"):
        return False
    schema_returns = returned.target._schema.returns
    return len(schema_returns) > 1 and all(
        str(ret.type) == "Tensor" for ret in schema_returns
    )
def handle_dynamo_export_graph(
    gm: torch.fx.GraphModule,
    inputs: List[torch.Tensor],
    compile_gm: Callable[..., Any],
):
    """
    `torch._dynamo.export` embeds pytrees in the FX graph codegen object,
    convert that to a normal FX graph so inductor can compile it.

    The graph's original codegen object is kept aside and replaced with a
    plain ``torch.fx.graph.CodeGen`` before ``gm`` is recompiled (``gm`` is
    mutated). The returned wrapper uses the saved codegen to process call
    arguments before, and results after, invoking the compiled function.
    """
    saved_codegen = gm.graph._codegen
    gm.graph._codegen = torch.fx.graph.CodeGen()
    gm.recompile()

    processed_example_inputs = saved_codegen.process_inputs(*inputs)
    compiled_fn = compile_gm(gm, processed_example_inputs)

    @functools.wraps(compiled_fn)
    def wrapper(*args):
        processed_args = saved_codegen.process_inputs(*args)
        raw_results = compiled_fn(*processed_args)
        return saved_codegen.process_outputs(raw_results)

    return wrapper
def get_use_stack_trace(node) -> Optional[str]:
    """Return the first non-empty ``"stack_trace"`` found among ``node``'s users.

    Each user's ``meta`` mapping is consulted in iteration order of
    ``node.users``; missing or falsy entries are skipped.

    Returns:
        The first truthy ``meta["stack_trace"]`` value, or ``None`` when no
        user provides one.
    """
    # Lazy generator: users after the first hit are never inspected.
    recorded_traces = (user.meta.get("stack_trace", None) for user in node.users)
    return next((trace for trace in recorded_traces if trace), None)
Found from : \n {stack_trace}" + ) + + return format_default_skip_message("cpu device") + + if ( + len(device_node_mapping) == 1 + and next(iter(device_node_mapping.keys())).type == "cuda" + ): + return None + + keys_repr = (repr(key) for key in device_node_mapping.keys()) + return format_default_skip_message(f"multiple devices: {', '.join(keys_repr)}") + + +def check_lowering_disable_cudagraph( + device_node_mapping: Dict[torch.device, torch.fx.Node] +): + return check_multiple_devices_or_any_cpu_nodes(device_node_mapping) + + +@dataclasses.dataclass +class BoxedDeviceIndex: + value: Optional[int] + + def set(self, device_idx: Optional[int]): + assert device_idx is None or isinstance(device_idx, int) + self.value = device_idx diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py new file mode 100644 index 0000000000000000000000000000000000000000..44d6b19f62d9f5b67aa2ab82606ead1b42bc9595 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/dependencies.py @@ -0,0 +1,506 @@ +import collections +import dataclasses +import itertools +import logging +import re +import typing +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from unittest.mock import patch + +import sympy + +import torch +from torch.fx.experimental.symbolic_shapes import free_unbacked_symbols + +from .codegen.common import index_prevent_reordering +from .utils import ( + get_dtype_size, + reduction_num_outputs, + sympy_index_symbol, + sympy_str, + sympy_subs, + VarRanges, +) +from .virtualized import OpsHandler, ReductionType, V + +log = logging.getLogger(__name__) +is_indirect = re.compile(r"indirect|tmp").search +Dep = Union["MemoryDep", "StarDep", "WeakDep"] + + +class MemoryDep(typing.NamedTuple): + name: str + index: sympy.Expr # type: ignore[assignment] + var_names: 
Tuple[sympy.Symbol, ...] + size: Tuple[sympy.Expr, ...] + + def __repr__(self): + return f"MemoryDep({self.name!r}, {self.index}, {self.ranges})" + + @property + def ranges(self) -> Dict[sympy.Symbol, sympy.Expr]: + """{c0: 128, c1: 512, ...}""" + return dict(zip(self.var_names, self.size)) + + def get_numel(self) -> sympy.Expr: + if self.is_indirect(): + numel = V.graph.get_numel(self.name) + else: + vars = set(self.index.free_symbols) + numel = sympy.Integer(1) + for var, size in zip(self.var_names, self.size): + if var in vars: + numel = numel * size + return numel + + def rename(self, renames: Dict[str, str]) -> "MemoryDep": + if self.name in renames: + return MemoryDep( + renames[self.name], self.index, var_names=self.var_names, size=self.size + ) + return self + + def numbytes_hint(self): + return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size( + V.graph.get_dtype(self.name) + ) + + def has_unbacked_symbols(self): + return len(free_unbacked_symbols(self.get_numel())) > 0 + + def is_contiguous(self) -> bool: + return isinstance(self.index, sympy.Symbol) and self.index in self.var_names + + def is_scalar(self) -> bool: + if isinstance(self.index, sympy.Symbol): + return self.index not in self.var_names and not self.is_indirect() + return isinstance(self.index, (int, sympy.Integer)) + + def is_indirect(self) -> bool: + return any(is_indirect(v.name) for v in self.index.free_symbols) # type: ignore[attr-defined] + + +class StarDep(typing.NamedTuple): + # depends on the entire buffer + name: str + + @property + def index(self): + raise NotImplementedError("StarDep does not have an index") + + def get_numel(self) -> sympy.Expr: + return V.graph.get_numel(self.name) + + def rename(self, renames: Dict[str, str]) -> "StarDep": + if self.name in renames: + return StarDep(renames[self.name]) + return self + + def numbytes_hint(self): + return V.graph.sizevars.size_hint(self.get_numel()) * get_dtype_size( + V.graph.get_dtype(self.name) + ) + + def 
has_unbacked_symbols(self): + return len(free_unbacked_symbols(self.get_numel())) > 0 + + def is_contiguous(self) -> bool: + return False + + def is_scalar(self) -> bool: + return False + + def is_indirect(self) -> bool: + return False + + +# Used for tracking mutation ordering +# if A reads a buffer and B mutates it +# B must be ordered after A +# +# It is weak because if it turns out A's read is never used, we can still +# eliminate it +class WeakDep(typing.NamedTuple): + name: str + + @property + def index(self): + raise NotImplementedError("WeakDep does not have an index") + + def get_numel(self) -> sympy.Expr: + return sympy.Integer(1) + + def rename(self, renames: Dict[str, str]) -> "WeakDep": + if self.name in renames: + return WeakDep(renames[self.name]) + return self + + def numbytes_hint(self): + return 1 # Purely inserted for ordering, not an actual dep + + def has_unbacked_symbols(self): + return False + + def is_contiguous(self) -> bool: + return False + + +class IndexExprDep(typing.NamedTuple): + index: sympy.Expr # type: ignore[assignment] + var_names: Tuple[sympy.Symbol, ...] + size: Tuple[sympy.Expr, ...] 
+ + +@dataclasses.dataclass +class ReadWrites: + reads: Set[Dep] + writes: Set[Dep] + index_exprs: Set[IndexExprDep] + range_vars: Optional[List[sympy.Expr]] = None + var_ranges: Optional[VarRanges] = None + op_counts: typing.Counter[str] = dataclasses.field( + default_factory=collections.Counter + ) + + def rename(self, renames: typing.Dict[str, str]) -> "ReadWrites": + return ReadWrites( + {dep.rename(renames) for dep in self.reads}, + {dep.rename(renames) for dep in self.writes}, + self.index_exprs, + self.range_vars, + self.var_ranges, + op_counts=self.op_counts, + ) + + def with_read(self, dep: Dep) -> "ReadWrites": + assert isinstance(dep, (WeakDep, StarDep)) + return ReadWrites( + set.union(self.reads, {dep}), + self.writes, + self.index_exprs, + self.range_vars, + self.var_ranges, + op_counts=self.op_counts, + ) + + def merge(self, other: "ReadWrites"): + reads = set.union(self.reads, other.reads) + writes = set.union(self.writes, other.writes) + index_exprs = set.union(self.index_exprs, other.index_exprs) + op_counts = collections.Counter(self.op_counts) + op_counts.update(other.op_counts) + return ReadWrites(reads - writes, writes, index_exprs, op_counts=op_counts) + + @staticmethod + def merge_list(read_writes: List["ReadWrites"]): + all_writes = set.union(*[rw.writes for rw in read_writes]) + all_reads = set.union(*[rw.reads for rw in read_writes]) - all_writes + all_index_exprs = set.union(*[rw.index_exprs for rw in read_writes]) + + op_counts: typing.Counter[Any] = collections.Counter() + for rw in read_writes: + op_counts.update(rw.op_counts) + + return ReadWrites(all_reads, all_writes, all_index_exprs, op_counts=op_counts) + + def remove_reads(self, rem_reads): + return ReadWrites( + self.reads - rem_reads, + self.writes, + self.index_exprs, + self.range_vars, + self.var_ranges, + op_counts=self.op_counts, + ) + + def reads_and_writes(self): + return itertools.chain(self.reads, self.writes) + + +class _RecordLoadStoreInner(V.MockHandler): # type: 
ignore[name-defined] + def __init__(self, var_ranges: VarRanges, normalize: bool): + super().__init__() + self._reads: Set[Dep] = set() + self._writes: Set[MemoryDep] = set() + self._index_exprs: Set[IndexExprDep] = set() + self._var_ranges: VarRanges = var_ranges + self._normalize: bool = normalize + + def canonicalize( + self, index: sympy.Expr + ) -> Tuple[sympy.Expr, Tuple[sympy.Symbol, ...], Tuple[sympy.Expr, ...]]: + if not self._normalize: + sizes = [V.graph.sizevars.simplify(x) for x in self._var_ranges.values()] + var_names = tuple( + k for k, v in zip(self._var_ranges.keys(), sizes) if v != 1 + ) + sizes = tuple(v for v in sizes if v != 1) + return index, var_names, sizes # type: ignore[return-value] + + # Try to further simplify the indexes even if simplify_loops didn't + # convert it to the simplest form because of the interference from + # different indexing formulas. + free_symbols = index.free_symbols + var_ranges = { + k: V.graph.sizevars.simplify(v) + for k, v in self._var_ranges.items() + # TODO(jansel): explore this further normalization + # if k in free_symbols + } + index_vars = [*var_ranges.keys()] + sizes = tuple(var_ranges.values()) + new_sizes, reindex, prune = V.graph.sizevars._simplify_loops( + index_vars, + sizes, + index_prevent_reordering([index], index_vars, sizes), + ) + + # assign new variables each dimension to deal with numbering mismatches + # d0, d1, d2 could become d0, d2 -- which won't match d0, d1 + new_vars, add_var = var_builder(canonicalization_prefix()) + replacement = dict(zip(index_vars, reindex([add_var(x) for x in new_sizes]))) + index = sympy_subs(sympy.expand(index), replacement) + + new_vars = [*new_vars.keys()] + new_sizes = [*new_sizes] + free_symbols = index.free_symbols + while new_vars and new_vars[-1] not in free_symbols: + # Reduction has last (reduced) dim in its sizes, but + # downstream users won't. Normalize this away. 
+ new_vars.pop() + new_sizes.pop() + return index, tuple(new_vars), tuple(new_sizes) # type: ignore[arg-type] + + def load(self, name: str, index: sympy.Expr) -> str: + self._reads.add(MemoryDep(name, *self.canonicalize(index))) + return f"load({name}, {sympy_str(index)})" + + def load_seed(self, name: str, index: int): + assert isinstance(index, int) + return self.load(name, sympy.Integer(index)) + + def store(self, name: str, index: sympy.Expr, value: str, mode=None) -> str: + self._writes.add(MemoryDep(name, *self.canonicalize(index))) + return f"store({name}, {sympy_str(index)}, {value}, {mode})" + + def store_reduction(self, name: str, index, value) -> str: + return self.store(name, index, f"store_reduction({value})") + + def index_expr(self, index: sympy.Expr, dtype) -> str: + self._index_exprs.add(IndexExprDep(*self.canonicalize(index))) + return f"index_expr({sympy_str(index)}, {dtype})" + + def bucketize( + self, + values, + offsets_name: str, + offsets_size: sympy.Expr, + indexing_dtype: torch.dtype, + right: bool, + ): + self._reads.add(StarDep(offsets_name)) + return f"bucketize({values}, {offsets_name}, {sympy_str(offsets_size)}, {indexing_dtype}, {right})" + + +class _OpCounter: + """Shim to count how many times each op is used""" + + def __init__(self, inner): + super().__init__() + self.parent_handler = inner + self._op_counts: typing.Counter[Any] = collections.Counter() + + def __getattr__(self, name): + self._op_counts[name] += 1 + return getattr(self.parent_handler, name) + + +class RecordLoadStore(V.KernelFormatterHandler): # type: ignore[name-defined] + def __init__(self, var_ranges: VarRanges, normalize: bool): + parent_handler = _RecordLoadStoreInner( + var_ranges=var_ranges, normalize=normalize + ) + parent_handler = _OpCounter(parent_handler) + super().__init__(parent_handler=parent_handler) + + +def var_builder(prefix: str) -> Tuple[VarRanges, Callable[[sympy.Expr], sympy.Symbol]]: + cnt = itertools.count() + var_ranges: VarRanges = dict() 
+ + def add_var(length: sympy.Expr) -> sympy.Symbol: + v = sympy_index_symbol(f"{prefix}{next(cnt)}") + var_ranges[v] = length + return v + + return var_ranges, add_var + + +def index_vars_no_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str): + var_ranges, add_var = var_builder(prefix) + args: List[List[sympy.Symbol]] = [] + for size in argsizes: + args.append(list(map(add_var, size))) + return args, var_ranges + + +def index_vars_squeeze(*argsizes: Tuple[sympy.Expr, ...], prefix: str = "d"): + from .ir import SqueezeView + + var_ranges, add_var = var_builder(prefix) + args: List[List[sympy.Expr]] = [] + new_sizes: List[List[sympy.Expr]] = [] + for size in argsizes: + new_size, reindex = SqueezeView.squeezer(size) + new_sizes.append(new_size) + args.append(reindex(list(map(add_var, new_size)))) + return args, var_ranges + + +def extract_read_writes( + fn: Callable[..., Any], + *argsizes: Tuple[sympy.Expr, ...], + normalize: bool = False, + prefix: str = "d", +): + args, var_ranges = index_vars_squeeze(*argsizes, prefix=prefix) + rw = RecordLoadStore(var_ranges, normalize=normalize) + with V.set_ops_handler(rw): + fn(*args) + + if normalize: + range_vars = [] # Number of vars could differ due to normalization + else: + range_vars = list(itertools.chain.from_iterable(args)) + + inner = rw.parent_handler.parent_handler + return ReadWrites( + set(inner._reads), + set(inner._writes), + inner._index_exprs, + range_vars, + var_ranges, + rw.parent_handler._op_counts, + ) + + +def extract_input_node_reduction_ranges( + input_node: "torch._inductor.ir.TensorBox", +) -> Tuple[Optional[List[sympy.Expr]], Optional[List[sympy.Expr]]]: + """ + Returns the size and reduction size of all inputs, if the sizes and reduction_sizes (if exist) are all the same. + It's possible that a node has multiple inputs, some are Reduction nodes and others are Pointwise nodes. + In this case, reduction_sizes of the Reduction nodes need to be the same. + Otherwise returns (None, None). 
+ """ + + from .ir import ComputedBuffer, Loops + + if isinstance(input_node.data, ComputedBuffer): + # Input node has already been realized. Return its size and reduction_size. + size = input_node.get_size() + reduction_size = input_node.get_reduction_size() + if len(reduction_size) > 0: + return (size, reduction_size) + else: + return (None, None) + + if not isinstance(input_node.data.data, Loops): # type: ignore[attr-defined] + # Other IRNodes do not have reduction_ranges. + return (None, None) + + # There is one issue: what if there are views / permutations between the input node and its dependent realized nodes? + # The current method still uses reduction ranges from the dependent realized node, which is not ideal. + # Is there a way to check whether there are permutations inbetween? + reads = input_node.get_reads() + reduction_size = None + size = None + while reduction_size is None and len(reads) > 0: + seen = set() + new_reads = [] + for read in reads: + if not isinstance(read, MemoryDep): + continue + if read.name in seen: + continue + seen.add(read.name) + buffer = V.graph.get_buffer(read.name) + if buffer is None: + continue + if ( + isinstance(buffer, ComputedBuffer) + and len(buffer.get_reduction_size()) > 0 + ): + if reduction_size is None: + reduction_size = buffer.get_reduction_size() + size = buffer.get_size() + elif ( + reduction_size != buffer.get_reduction_size() + or size != buffer.get_size() + ): + return (None, None) + else: + new_reads.extend(buffer.get_reads()) + if reads == new_reads: + return (size, reduction_size) + else: + reads = new_reads + return (size, reduction_size) + + +def canonicalization_prefix(): + return "c" + + +# ops handler which computes all the free unbacked symbols for an IR +class FreeUnbackedSymbolsOpsHandler: + symbols: Set[sympy.Symbol] + + def __init__(self): + self.symbols = set() + + def __getattr__(self, name: str) -> Callable[..., Any]: + def inner(*args, **kwargs): + for a in itertools.chain(args, 
kwargs.values()): + if isinstance(a, (sympy.Expr, sympy.logic.boolalg.Boolean)): + self.symbols |= free_unbacked_symbols(a) + + return inner + + def indirect_indexing(self, index_var, size, check=True) -> sympy.Symbol: + assert not isinstance(index_var, (sympy.Expr, sympy.logic.boolalg.Boolean)) + self.symbols |= free_unbacked_symbols(size) + return sympy_index_symbol(f"({str(index_var)})") + + def frexp(self, x): + return (None,) * 2 + + def reduction( + self, + dtype: torch.dtype, + src_dtype: torch.dtype, + reduction_type: ReductionType, + value: Union[None, Tuple[None, ...]], + ) -> Union[None, Tuple[None, ...]]: + num_values = reduction_num_outputs(reduction_type) + return (None,) * num_values if num_values > 1 else None + + +def _typecheck_FreeUnbackedSymbolsOpsHandler( + h: FreeUnbackedSymbolsOpsHandler, +) -> OpsHandler[None]: + return h + + +def extract_free_unbacked_symbols(fn: Callable[..., Any], index, rindex=None): + from .ir import FlexibleLayout + + args = [index, rindex] if rindex is not None else [index] + handler = FreeUnbackedSymbolsOpsHandler() + # NB: I cargo culted the allow_indexing patch here, I don't understand why + # people do this all over + with V.set_ops_handler(handler), patch.object( + FlexibleLayout, "allow_indexing", True + ): + fn(*args) + return handler.symbols diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/exc.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/exc.py new file mode 100644 index 0000000000000000000000000000000000000000..d9076e1c1808bb4d9a5a37b2774fbd9d839d36bd --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/exc.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +import os +import tempfile +import textwrap +from functools import lru_cache + +if os.environ.get("TORCHINDUCTOR_WRITE_MISSING_OPS") == "1": + + @lru_cache(None) + def _record_missing_op(target): + with 
open(f"{tempfile.gettempdir()}/missing_ops.txt", "a") as fd: + fd.write(str(target) + "\n") + +else: + + def _record_missing_op(target): # type: ignore[misc] + pass + + +class OperatorIssue(RuntimeError): + @staticmethod + def operator_str(target, args, kwargs): + lines = [f"target: {target}"] + [ + f"args[{i}]: {arg}" for i, arg in enumerate(args) + ] + if kwargs: + lines.append(f"kwargs: {kwargs}") + return textwrap.indent("\n".join(lines), " ") + + +class MissingOperatorWithoutDecomp(OperatorIssue): + def __init__(self, target, args, kwargs): + _record_missing_op(target) + super().__init__(f"missing lowering\n{self.operator_str(target, args, kwargs)}") + + +class MissingOperatorWithDecomp(OperatorIssue): + def __init__(self, target, args, kwargs): + _record_missing_op(target) + super().__init__( + f"missing decomposition\n{self.operator_str(target, args, kwargs)}" + + textwrap.dedent( + f""" + + There is a decomposition available for {target} in + torch._decomp.get_decompositions(). Please add this operator to the + `decompositions` list in torch._inductor.decompositions + """ + ) + ) + + +class LoweringException(OperatorIssue): + def __init__(self, exc: Exception, target, args, kwargs): + super().__init__( + f"{type(exc).__name__}: {exc}\n{self.operator_str(target, args, kwargs)}" + ) + + +class InvalidCxxCompiler(RuntimeError): + def __init__(self): + from . 
import config + + super().__init__( + f"No working C++ compiler found in {config.__name__}.cpp.cxx: {config.cpp.cxx}" + ) + + +class CppWrapperCodeGenError(RuntimeError): + def __init__(self, msg: str): + super().__init__(f"C++ wrapper codegen error: {msg}") + + +class CppCompileError(RuntimeError): + def __init__(self, cmd: list[str], output: str): + if isinstance(output, bytes): + output = output.decode("utf-8") + + super().__init__( + textwrap.dedent( + """ + C++ compile error + + Command: + {cmd} + + Output: + {output} + """ + ) + .strip() + .format(cmd=" ".join(cmd), output=output) + ) + + +class CUDACompileError(CppCompileError): + pass diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..1cff2844855f9ba1d2328f7036aca61f68da96c5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/fx_utils.py @@ -0,0 +1,220 @@ +import operator +from collections import defaultdict +from typing import Any, Callable, DefaultDict, Dict, Optional, Tuple, Type + +import torch +import torch.fx +from torch.fx.experimental.symbolic_shapes import statically_known_true, sym_eq +from torch.utils import _pytree as pytree +from torch.utils._pytree import tree_map +from .virtualized import V + + +# Check the pattern: (nn.module, F.function/torch.Tensor.method) matched. +# Works for length 2 patterns with 1 module and 1 function/method. 
+def matches_module_function_pattern( + pattern: Tuple[Type[torch.nn.modules.Module], Callable[..., Any]], + node: torch.fx.node.Node, + modules: Dict[str, torch.nn.modules.Module], +) -> bool: + if len(node.args) == 0: + return False + if not isinstance(node.args[0], torch.fx.Node) or not isinstance( + node, torch.fx.Node + ): + return False + # the first node is call_module + if node.args[0].op != "call_module": + return False + if not isinstance(node.args[0].target, str): + return False + if node.args[0].target not in modules: + return False + if type(modules[node.args[0].target]) is not pattern[0]: + return False + # the second node is call_function or call_method + if node.op != "call_function" and node.op != "call_method": + return False + if node.target != pattern[1]: + return False + # make sure node.args[0] output is only used by current node. + if len(node.args[0].users) > 1: + return False + return True + + +class FakeTensorUpdater: + """ + The main idea here is that it's difficult to maintain accurate fake + tensors (our primary form of metadata) for each node in our graph as we + transform it. + + The most reliable way to obtain this information is by rerunning + faketensor propagation. However, in general, faketensor propagation is + fairly expensive. So, instead we'd like to only rerun faketensor + propagation on nodes that have changed. + + In order to detect which nodes have changed, we first hash its node, + target, and argument lists (which are immutable in FX). + + Then, whenever we call incremental_update, we check which FX nodes have a + new hash, and recompute the faketensor metadata for that node. Then, we + continue to recursively compute the faketensors for all users until the + fake tensors stop changing. 
+ """ + + def __init__(self, graph: torch.fx.Graph): + self.processed_hashes = set() + self.graph = graph + + for node in self.graph.nodes: + self.processed_hashes.add(self.hash_node(node)) + + def hash_node(self, node: torch.fx.Node): + # todo(chilli): Not a great hash function + return (node, node.target, id(node.args), id(node.kwargs)) + + def incremental_update(self): + processed = set() + existing_storages: DefaultDict[Optional[int], int] = defaultdict(int) + for node in self.graph.nodes: + existing_storages[get_node_storage(node)] += 1 + + def is_intlist_same(new, old): + return statically_known_true(sym_eq(new, old)) + + def is_fake_tensor_same(new, old): + if type(new) != type(old): + return False + if isinstance(new, (list, tuple)): + if len(new) != len(old): + return False + return all( + is_fake_tensor_same(new_i, old_i) for new_i, old_i in zip(new, old) + ) + assert isinstance(new, torch.Tensor) + if not is_intlist_same(new.shape, old.shape) or new.layout != old.layout: + return False + if new.layout == torch.strided and ( + not is_intlist_same(new.stride(), old.stride()) + or not statically_known_true( + new.storage_offset() == old.storage_offset() + ) + ): + return False + + if get_storage(new) == get_storage(old): + return True + + # This is the case where it returns a completely fresh storage that's used nowhere else. 
+ if ( + existing_storages[get_storage(old)] == 1 + and get_storage(new) not in existing_storages + ): + return True + return False + + for node in self.graph.nodes: + if self.hash_node(node) in self.processed_hashes: + continue + + def is_aten_node(node): + return node.op == "call_function" and isinstance( + node.target, torch._ops.OpOverload + ) + + if not is_aten_node(node): + continue + + processing = [node] + while len(processing) > 0: + updating_node = processing.pop() + if updating_node in processed: + continue + if is_aten_node(updating_node): + continue + + is_valid, args, kwargs = get_fake_args_kwargs(updating_node) + if not is_valid: + continue + with V.fake_mode: + new_fake_tensor = updating_node.target(*args, **kwargs) + if "val" in updating_node.meta and is_fake_tensor_same( + new_fake_tensor, updating_node.meta["val"] + ): + continue + updating_node.meta["val"] = new_fake_tensor + + # todo(chilli): This code path is not exercised by our existing + # tests - add a test + existing_storages[get_node_storage(new_fake_tensor)] += 1 + processed.add(updating_node) + processing.extend(updating_node.users) + + self.processed_hashes.add(self.hash_node(updating_node)) + + +def get_storage(t: torch.Tensor) -> int: + return t.untyped_storage()._cdata + + +def get_node_storage(node: torch.fx.Node) -> Optional[int]: + if "val" not in node.meta: + return None + if not isinstance(node.meta["val"], torch.Tensor): + return None + if not torch._C._has_storage(node.meta["val"]): + return None + return get_storage(node.meta["val"]) + + +def get_fake(x): + if isinstance(x, torch.fx.Node): + if "val" not in x.meta: + return x + return x.meta["val"] + return x + + +def get_fake_args_kwargs(x: torch.fx.Node) -> Tuple[bool, Tuple[Any], Dict[str, Any]]: + """ + First value returns a boolean if any of the input nodes don't have a faketensor. 
+ """ + args, kwargs = tree_map(get_fake, (x.args, x.kwargs)) + if any( + isinstance(a, torch.fx.Node) for a in pytree.arg_tree_leaves(*args, **kwargs) + ): + return False, args, kwargs + return True, args, kwargs + + +def is_node_realized(node: torch.fx.Node) -> bool: + """Returns true if a node is always realized when lowered to inductor IR. + + NOTE: This may return some false negatives. e.g. it doesn't + handle buffers realized heuristically during lowering, or + buffers realized indirectly through view ops. + """ + from torch._inductor.lowering import fallbacks, needs_realized_inputs + + def is_buffer(node: torch.fx.Node) -> bool: + if node.op == "call_function" and node.target is operator.getitem: + # For nodes with multiple outputs, we get the fx graph: + # foo = torch.ops.aten.foo(...) + # getitem = foo[0] + # getitem_1 = foo[1] + # where we need to check if foo is a fallback kernel + return is_buffer(node.args[0]) # type: ignore[arg-type] + return node.op in ("placeholder", "output") or node.target in fallbacks + + if is_buffer(node): + return True + + def realizes_inputs(node: torch.fx.Node) -> bool: + return node.op == "output" or node.target in needs_realized_inputs + + if any(realizes_inputs(user) for user in node.users): + return True + + # Otherwise, assume node isn't realized + return False diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py new file mode 100644 index 0000000000000000000000000000000000000000..0a02b670982b6157a6989e1b8252c7c2cdc59087 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/index_propagation.py @@ -0,0 +1,277 @@ +"""This file implements the IndexPropagation ops handler, which wraps an +underlying handler to add a limited form of constant propagation, as well as +propagation of sympy expressions downstream of 
ops.index_expr calls. + +For example, say we have the IR: + + tmp0 = ops.index_expr(x, torch.int32) + tmp1 = ops.constant(2, torch.int32) + tmp2 = ops.mul(tmp0, tmp1) + tmp3 = ops.indirect_indexing(tmp2, x_size) + tmp4 = ops.load("buf0", tmp3) + +The underlying handler would just see: + + ops.load("buf0", x * 2) + +This is limited by the set of operators handled in the sympy expression +printers. So simple operations like minimum and maximum cannot be translated to +SymPy expressions yet, despite sympy.Min and sympy.Max existing. + +""" +import itertools +from dataclasses import dataclass +from typing import Any, Callable, Dict, Literal, Optional, overload, Tuple, Union + +import sympy + +from typing_extensions import TypeAlias + +import torch +from torch._prims_common import is_boolean_dtype, is_integer_dtype +from torch.utils._sympy.functions import FloorDiv, ModularIndexing, Where + + +@dataclass +class TypedExpr: + """A SymPy expression with associated type""" + + expr: sympy.Expr + dtype: torch.dtype + + +class SymPyOps: + """An ops handler where all IR values are SymPy expressions + + When a value cannot be represented as a SymPy expression, the method is + either not defined, or returns NotImplemented + + """ + + @staticmethod + def identity(value: Any) -> Any: + return value + + @staticmethod + def constant(value: Union[int, float, bool], dtype: torch.dtype) -> TypedExpr: + if is_boolean_dtype(dtype): + expr = sympy.Integer(bool(value)) + elif is_integer_dtype(dtype): + expr = sympy.Integer(int(value)) + else: + expr = sympy.Float(float(value)) + return TypedExpr(expr, dtype) + + @staticmethod + def index_expr(value: sympy.Expr, dtype: torch.dtype) -> Union[int, TypedExpr]: + if isinstance(value, int): + value = sympy.Integer(value) + return TypedExpr(value, dtype) + + @staticmethod + def to_dtype( + value: Any, dtype: torch.dtype, src_dtype: Optional[torch.dtype] = None + ) -> Union[int, TypedExpr]: + if isinstance(value.expr, (sympy.Integer, 
sympy.Float)): + return SymPyOps.constant(value.expr, dtype) + elif is_integer_dtype(dtype) and is_integer_dtype(value.dtype): + return SymPyOps.index_expr(value.expr, dtype) + else: + # TODO: Inductor doesn't handle floating point in sympy expressions well at the moment + return NotImplemented + + @staticmethod + def square(x: TypedExpr) -> TypedExpr: + return TypedExpr(x.expr * x.expr, x.dtype) + + @staticmethod + def add(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(x.expr + y.expr, result_type) + + @staticmethod + def sub(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(x.expr - y.expr, result_type) + + @staticmethod + def mul(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(x.expr * y.expr, result_type) + + @staticmethod + def neg(x: TypedExpr) -> TypedExpr: + return TypedExpr(-x.expr, x.dtype) + + @staticmethod + def floordiv(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + if not is_integer_dtype(result_type): + return NotImplemented + + return TypedExpr(FloorDiv(x.expr, y.expr), result_type) + + @staticmethod + def mod(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]: + result_type = torch.promote_types(x.dtype, y.dtype) + if not is_integer_dtype(result_type): + return NotImplemented + + result_expr = ModularIndexing(x.expr, sympy.Integer(1), y.expr) + return TypedExpr(result_expr, result_type) + + @staticmethod + def remainder(x: TypedExpr, y: TypedExpr) -> Optional[TypedExpr]: + result_type = torch.promote_types(x.dtype, y.dtype) + if not is_integer_dtype(result_type): + return NotImplemented + # In these cases, remainder in Python == remainder in C++, so this transformation + # is sound + if ( + x.expr.is_nonnegative is not None + and x.expr.is_nonnegative == y.expr.is_positive + ): + result_expr = 
ModularIndexing(x.expr, sympy.Integer(1), y.expr) + return TypedExpr(result_expr, result_type) + return NotImplemented + + @staticmethod + def minimum(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(sympy.Min(x.expr, y.expr), result_type) + + @staticmethod + def maximum(x: TypedExpr, y: TypedExpr) -> TypedExpr: + result_type = torch.promote_types(x.dtype, y.dtype) + return TypedExpr(sympy.Max(x.expr, y.expr), result_type) + + +@dataclass +class IndexPropVar: + value: Any # Either an IR value, or TypedExpr if is_symbolic is true + is_symbolic: bool = False + + @staticmethod + def new_symbolic(expr: TypedExpr) -> "IndexPropVar": + return IndexPropVar(expr, is_symbolic=True) + + def __post_init__(self): + assert not self.is_symbolic or isinstance( + self.value, TypedExpr + ), "Symbolic IndexPropVar must contain a TypedExpr" + + +IndexPropResult: TypeAlias = Union[IndexPropVar, Tuple["IndexPropResult", ...]] + + +class IndexPropagation: + """Ops wrapper that tries to propagate constant and index_expr values through the computation. + + This aims to maximize the compile time simplification possible, and convert + indirect indexing from arange into normal static indexing. 
+ + """ + + def __init__(self, inner: Any): + self._inner = inner + + def materialize_expr(self, expr: sympy.Expr, dtype: torch.dtype) -> Any: + # Construct a new constant/index_expr from the SymPy expression + if isinstance(expr, sympy.Integer): + return self._inner.constant(int(expr), dtype) + elif expr.is_number: + return self._inner.constant(float(expr), dtype) + return self._inner.index_expr(expr, dtype) + + def unwrap(self, a: Union[Any, IndexPropVar]) -> Any: + if isinstance(a, (list, tuple)): + return tuple(self.unwrap(v) for v in a) + + if not isinstance(a, IndexPropVar): + return a + + # Prefer the sympy representation if possible + if a.is_symbolic: + return self.materialize_expr(a.value.expr, a.value.dtype) + + return a.value + + def wrap(self, a) -> IndexPropResult: + if isinstance(a, (list, tuple)): + return tuple(self.wrap(v) for v in a) + return IndexPropVar(a) + + @overload + def fallback( + self, + name: Literal["indirect_indexing"], + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + ) -> IndexPropVar: + ... + + @overload + def fallback( + self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any] + ) -> IndexPropResult: + ... 
+ + def fallback( + self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any] + ) -> IndexPropResult: + # Fallback to the wrapped handler + new_args = [self.unwrap(a) for a in args] + new_kwargs = {k: self.unwrap(v) for k, v in kwargs.items()} + return self.wrap(getattr(self._inner, name)(*new_args, **new_kwargs)) + + def propagate_sympy( + self, name: str, args: Tuple[Any, ...], kwargs: Dict[str, Any] + ) -> IndexPropResult: + # Build a new SymPy expression from this ops call + def unwrap(a: Union[Any, IndexPropVar]) -> Any: + if not isinstance(a, IndexPropVar): + return a + return a.value + + new_args = [unwrap(a) for a in args] + new_kwargs = {k: unwrap(v) for k, v in kwargs.items()} + new_expr = getattr(SymPyOps, name)(*new_args, **new_kwargs) + is_valid_expr = new_expr is not NotImplemented and ( + # Inductor doesn't expect floating point in sympy expressions, but + # allow floating point constants to be propagated + isinstance(new_expr.expr, sympy.Number) + or new_expr.expr.is_integer + ) + if not is_valid_expr: + return self.fallback(name, args, kwargs) + return IndexPropVar.new_symbolic(new_expr) + + def __getattr__(self, name: str) -> Callable[..., IndexPropResult]: + def inner(*args: Any, **kwargs: Any) -> IndexPropResult: + if not hasattr(SymPyOps, name): + return self.fallback(name, args, kwargs) + + var_arguments = [ + a + for a in itertools.chain(args, kwargs.values()) + if isinstance(a, IndexPropVar) + ] + if not all(v.is_symbolic for v in var_arguments): + return self.fallback(name, args, kwargs) + + return self.propagate_sympy(name, args, kwargs) + + return inner + + def indirect_indexing( + self, index: Union[Any, IndexPropVar], size: Any, check: bool = True + ) -> Any: + # nb. We do index + Where(...) 
rather than Where(idx >= 0, idx, idx + sz) because we don't have CSE + # for SymPy expressions, so we don't want to repeat idx too much + + # indirect_indexing returns a sympy value, so no need to wrap in IndexPropVar here + if isinstance(index, IndexPropVar) and index.is_symbolic: + # If we are turning a indirect indexing into direct, we need to wrap it. + index = index.value.expr + return index + Where(index >= 0, 0, size) + return self.fallback("indirect_indexing", (index, size, check), {}).value diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..c9a2fcca5f2fb9bbb2118b5b833be61670ee6d3d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/metrics.py @@ -0,0 +1,419 @@ +from __future__ import annotations + +import csv +import inspect +import os +import re +from dataclasses import dataclass +from functools import lru_cache + +from typing import Dict, List, Set, Tuple, TYPE_CHECKING, Union + +from torch._inductor import config +from torch._inductor.utils import get_benchmark_name + +# Prevent circular import +if TYPE_CHECKING: + from torch._inductor.scheduler import ( + BaseSchedulerNode, + ExternKernelSchedulerNode, + NopKernelSchedulerNode, + SchedulerNode, + ) + +# counter for tracking how many kernels have been generated +generated_kernel_count = 0 +generated_cpp_vec_kernel_count = 0 +num_bytes_accessed = 0 +nodes_num_elem: List[ + Tuple[ + Union[NopKernelSchedulerNode, SchedulerNode, ExternKernelSchedulerNode], + int, + ] +] = [] +node_runtimes: List[Tuple[BaseSchedulerNode, float]] = [] + +# counters for tracking fusions +ir_nodes_pre_fusion = 0 + +# counters for tracking to_dtype inserted +cpp_to_dtype_count = 0 + +# counters for tracking cpp_wrapper disabled +disable_cpp_wrapper = 0 + + +# reset all 
counters +def reset(): + global generated_kernel_count + global generated_cpp_vec_kernel_count + global num_bytes_accessed, nodes_num_elem + global ir_nodes_pre_fusion + global cpp_to_dtype_count + global disable_cpp_wrapper + + generated_kernel_count = 0 + generated_cpp_vec_kernel_count = 0 + num_bytes_accessed = 0 + nodes_num_elem.clear() + node_runtimes.clear() + ir_nodes_pre_fusion = 0 + cpp_to_dtype_count = 0 + disable_cpp_wrapper = 0 + + +@dataclass +class CachedMetricsDeltas: + """ + The subset of metrics we want update across cache hits, e.g., the + FxGraphCache. + """ + + generated_kernel_count: int + generated_cpp_vec_kernel_count: int + ir_nodes_pre_fusion: int + cpp_to_dtype_count: int + + +class CachedMetricsHelper: + """ + A helper class to help calculate and apply counter deltas for those + metrics we want to save with cache entries (e.g., FxGraphCache) and + apply on a cache hit. + """ + + def __init__(self): + global generated_kernel_count + global generated_cpp_vec_kernel_count + global ir_nodes_pre_fusion + global cpp_to_dtype_count + + self.generated_kernel_count = generated_kernel_count + self.generated_cpp_vec_kernel_count = generated_cpp_vec_kernel_count + self.ir_nodes_pre_fusion = ir_nodes_pre_fusion + self.cpp_to_dtype_count = cpp_to_dtype_count + + def get_deltas(self) -> CachedMetricsDeltas: + global generated_kernel_count + global generated_cpp_vec_kernel_count + global ir_nodes_pre_fusion + global cpp_to_dtype_count + + return CachedMetricsDeltas( + generated_kernel_count - self.generated_kernel_count, + generated_cpp_vec_kernel_count - self.generated_cpp_vec_kernel_count, + ir_nodes_pre_fusion - self.ir_nodes_pre_fusion, + cpp_to_dtype_count - self.cpp_to_dtype_count, + ) + + @staticmethod + def apply_deltas(delta: CachedMetricsDeltas): + global generated_kernel_count + global generated_cpp_vec_kernel_count + global ir_nodes_pre_fusion + global cpp_to_dtype_count + + generated_kernel_count += delta.generated_kernel_count + 
generated_cpp_vec_kernel_count += delta.generated_cpp_vec_kernel_count + ir_nodes_pre_fusion += delta.ir_nodes_pre_fusion + cpp_to_dtype_count += delta.cpp_to_dtype_count + + +REGISTERED_METRIC_TABLES: Dict[str, MetricTable] = {} + + +@dataclass +class MetricTable: + table_name: str + column_names: List[str] + + num_rows_added: int = 0 + + def add_row(self, row_fn): + if self.table_name not in enabled_metric_tables(): + return + + row_dict = row_fn() + assert len(self.column_names) == len( + row_dict + ), f"{len(self.column_names)} v.s. {len(row_dict)}" + assert set(self.column_names) == set( + row_dict.keys() + ), f"{set(self.column_names)} v.s. {set(row_dict.keys())}" + + row = [ + get_benchmark_name(), + ] + row += [row_dict[column_name] for column_name in self.column_names] + self._write_row(row) + + def output_filename(self): + return f"metric_table_{self.table_name}.csv" + + def write_header(self): + filename = self.output_filename() + with open(filename, "w") as fd: + writer = csv.writer(fd, lineterminator="\n") + writer.writerow(["model_name"] + self.column_names) + + def _write_row(self, row): + filename = self.output_filename() + if self.num_rows_added == 0 and not os.path.exists(filename): + self.write_header() + + self.num_rows_added += 1 + + for idx, orig_val in enumerate(row): + if isinstance(orig_val, float): + new_val = f"{orig_val:.6f}" + elif orig_val is None: + new_val = "" + else: + new_val = orig_val + row[idx] = new_val + + with open(filename, "a") as fd: + writer = csv.writer(fd, lineterminator="\n") + writer.writerow(row) + + @staticmethod + def register_table(name, column_names): + table = MetricTable(name, column_names) + REGISTERED_METRIC_TABLES[name] = table + + +MetricTable.register_table( + "slow_fusion", + [ + "kernel1_path", + "kernel1_latency", + "kernel2_path", + "kernel2_latency", + "fused_kernel_path", + "fused_kernel_latency", + "slow_down_ratio", + ], +) + +# track the fusion statistics for each graph 
+MetricTable.register_table( + "graph_stats", + [ + "graph_id", + "num_nodes_before_fusion", + "num_nodes_after_fusion", + ], +) + +# track the perf difference between persistent reduction and non-persistent +# reductions +MetricTable.register_table( + "persistent_red_perf", + [ + "kernel1_name", + "kernel2_name", + "kernel1_latency", + "kernel2_latency", + "size_hints", + "reduction_hint", + "speedup", + ], +) + +# Log metadata for pointwise/reduction kernels. E.g., model name, kernel path, numel, rnumel, reduction hint +MetricTable.register_table( + "kernel_metadata", + [ + "kernel_name", + "kernel_path", + "kernel_category", # pointwise/reduction/foreach etc. + "size_hints", + "reduction_hint", + "line_of_code", + "num_load", + "num_store", + "num_for_loop", + "num_atomic_add", + "num_args", + # xyz numel can be different to size_hints since size_hints are rounded + # up to the nearest power of 2. + # Inductor kernel will burn in the xyz numel in kernel code for static + # shape kernels. + # Logging them will be helpful to find unaligned shape for reduction + "xnumel", + "ynumel", + "rnumel", + "kernel_args_num_gb", + ], +) + + +def _parse_kernel_fn_code(kernel_module_code): + """ + The kernel_module_code is the python module that contains kernel function code. + kernel function is the proper triton kernel function annotated with + @triton.jit + """ + from .codecache import PyCodeCache + from .wrapper_benchmark import get_triton_kernel + + mod = PyCodeCache.load(kernel_module_code) + kernel = get_triton_kernel(mod) + # kernel is a CachingAutotune; kernel.fn is the JITFunction; + # kernel.fn.fn is the function being decorate by triton.jit + return inspect.getsource(kernel.fn.fn) + + +def _parse_kernel_line_of_code(proper_kernel_fn_code): + """ + Return the line of code for the kernel excluding the decorators. 
+ """ + return len(proper_kernel_fn_code.splitlines()) + + +def _parse_size_hints(kernel_module_code, kernel_category): + if kernel_category == "foreach": + # foreach kernel does not have size_hints + return None + m = re.search(r"size_hints=(\[[0-9, ]*\]),", kernel_module_code) + assert m, "size_hints missing!" + return m.group(1) + + +def _parse_reduction_hint(kernel_category, kernel_module_code): + if kernel_category not in ("reduction", "persistent_reduction"): + return None + m = re.search(r"reduction_hint=ReductionHint\.(\w*),", kernel_module_code) + assert m, "reduction_hint not found in kernel source code!" + return m.group(1) + + +def _count_pattern(proper_kernel_fn_code, pattern): + return proper_kernel_fn_code.count(pattern) + + +def _count_args(proper_kernel_fn_code): + def_line = proper_kernel_fn_code.splitlines()[0] + assert def_line.startswith("def ") + start_idx = def_line.index("(") + end_idx = def_line.index("):") + decl_csv = def_line[start_idx + 1 : end_idx] + comps = decl_csv.split(",") + return len(comps) + + +def _parse_proper_kernel_fn_code(kernel_fn_code): + """ + Skip decorators. + """ + start_pos = kernel_fn_code.index("def ") + return kernel_fn_code[start_pos:] + + +def _parse_numel(proper_kernel_fn_code, numel_arg_name): + m = re.search(f"{numel_arg_name} = ([\\d]+)", proper_kernel_fn_code) + if m: + return int(m.group(1)) + else: + return None + + +def _parse_kernel_args_num_gb(kernel_fn_code, kernel_category): + """ + inductor meta looks like: + inductor_meta={... 'mutated_arg_names': [], 'no_x_dim': False, 'kernel_num_gb': 2.0}, + """ + m = re.search(r".kernel_num_gb.:\s*([0-9.]+)", kernel_fn_code) + if m: + return float(m.group(1)) + else: + """ + There are a few cases that kernel_num_gdb field can be missing: + 1. the field will be missing if config.benchmark_kernel and + config.profile_bandwidth are false + 2. even if config.benchmark_kernel or config.profile_bandwidth is true. 
+ foreach kernel does not have kernel_num_gb field in the metadata + """ + return None + + +def log_kernel_metadata(kernel_name, kernel_path, kernel_module_code): + """ + An utility to log kernel metadata. We may parse metadata from kernel source code here. + + It's fine to parse the generated kernel code here since the logging is + disabled by default. It would hurt compilation time. + """ + from .wrapper_benchmark import get_kernel_category_by_source_code + + kernel_category = get_kernel_category_by_source_code(kernel_module_code) + reduction_hint = _parse_reduction_hint(kernel_category, kernel_module_code) + size_hints = _parse_size_hints(kernel_module_code, kernel_category) + kernel_fn_code = _parse_kernel_fn_code(kernel_module_code) + + proper_kernel_fn_code = _parse_proper_kernel_fn_code(kernel_fn_code) + + # the line of code excluding the decortors + kernel_line_of_code = _parse_kernel_line_of_code(proper_kernel_fn_code) + + get_metric_table("kernel_metadata").add_row( + lambda: { + "kernel_name": kernel_name, + "kernel_path": kernel_path, + "kernel_category": kernel_category, + "size_hints": size_hints, + "reduction_hint": reduction_hint, + "line_of_code": kernel_line_of_code, + "num_load": _count_pattern(proper_kernel_fn_code, "tl.load"), + "num_store": _count_pattern(proper_kernel_fn_code, "tl.store"), + "num_for_loop": _count_pattern(proper_kernel_fn_code, "for "), + "num_atomic_add": _count_pattern(proper_kernel_fn_code, "tl.atomic_add"), + "num_args": _count_args(proper_kernel_fn_code), + "xnumel": _parse_numel(proper_kernel_fn_code, "xnumel"), + "ynumel": _parse_numel(proper_kernel_fn_code, "ynumel"), + "rnumel": _parse_numel(proper_kernel_fn_code, "rnumel"), + "kernel_args_num_gb": _parse_kernel_args_num_gb( + kernel_fn_code, kernel_category + ), + } + ) + + +def purge_old_log_files(): + """ + Purge the old log file at the beginning when the benchmark script runs. 
+ Should do it in the parent process rather than the child processes running + each individual model. + """ + for name, table in REGISTERED_METRIC_TABLES.items(): + if name in enabled_metric_tables(): + filename = table.output_filename() + if os.path.exists(filename): + os.unlink(filename) + + table.write_header() + + +@lru_cache +def enabled_metric_tables() -> Set[str]: + config_str = config.enabled_metric_tables + + enabled = set() + for name in config_str.split(","): + name = name.strip() + if not name: + continue + assert ( + name in REGISTERED_METRIC_TABLES + ), f"Metric table name {name} is not registered" + enabled.add(name) + return enabled + + +def is_metric_table_enabled(name): + return name in enabled_metric_tables() + + +def get_metric_table(name): + assert name in REGISTERED_METRIC_TABLES, f"Metric table {name} is not defined" + return REGISTERED_METRIC_TABLES[name] diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/triton_helpers.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/triton_helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..4f7f3145542b3af36ffc84cb2fc426520e147097 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/triton_helpers.py @@ -0,0 +1,344 @@ +import triton +import triton.language as tl + +# In the latest triton, math functions were shuffled around into different modules: +# https://github.com/openai/triton/pull/3172 +if hasattr(tl.extra.cuda, "libdevice"): + libdevice = tl.extra.cuda.libdevice + math = tl.math +else: + libdevice = tl.math + math = tl + + +@triton.jit +def promote_to_tensor(x): + # Addition promotes to tensor for us + return x + tl.zeros((1,), tl.int1) + + +@triton.jit +def is_floating(x): + return promote_to_tensor(x).dtype.is_floating() + + +@triton.jit +def _prod_accumulate(a, b): + return a * b + + +@triton.jit +def prod(input, axis): + return tl.reduce(input, 
axis, _prod_accumulate) + + +@triton.jit +def minimum(a, b): + mask = a < b + if is_floating(a): + mask |= a != a + return tl.where(mask, a, b) + + +@triton.jit +def maximum(a, b): + mask = a > b + if is_floating(a): + mask |= a != a + return tl.where(mask, a, b) + + +@triton.jit +def min2(a, dim): + return tl.reduce(a, dim, minimum) + + +@triton.jit +def max2(a, dim): + return tl.reduce(a, dim, maximum) + + +@triton.jit +def minimum_with_index(a_value, a_index, b_value, b_index): + mask = a_value < b_value + equal = a_value == b_value + if is_floating(a_value): + a_isnan = a_value != a_value + b_isnan = b_value != b_value + mask |= a_isnan and not b_isnan + # Consider NaNs as equal + equal |= a_isnan and b_isnan + + # Prefer lowest index if values are equal + mask |= equal & (a_index < b_index) + return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index) + + +@triton.jit +def maximum_with_index(a_value, a_index, b_value, b_index): + mask = a_value > b_value + equal = a_value == b_value + if is_floating(a_value): + a_isnan = a_value != a_value + b_isnan = b_value != b_value + mask |= a_isnan and not b_isnan + # Consider NaNs as equal + equal |= a_isnan and b_isnan + + # Prefer lowest index if values are equal + mask |= equal & (a_index < b_index) + return tl.where(mask, a_value, b_value), tl.where(mask, a_index, b_index) + + +@triton.jit +def min_with_index(value, index, dim): + return tl.reduce((value, index), dim, minimum_with_index) + + +@triton.jit +def max_with_index(value, index, dim): + return tl.reduce((value, index), dim, maximum_with_index) + + +@triton.jit +def welford_reduce(value, mean, m2, weight, first_iteration): + if first_iteration: + new_weight = tl.full(weight.shape, 1, weight.dtype) + new_mean = value + new_m2 = tl.zeros_like(m2) + else: + delta = value - mean + new_weight = weight + 1 + new_mean = mean + delta / new_weight + new_m2 = m2 + delta * (value - new_mean) + return new_mean, new_m2, new_weight + + +@triton.jit +def 
welford_combine(mean_1, m2_1, weight_1, mean_2, m2_2, weight_2): + delta = mean_2 - mean_1 + new_weight = weight_1 + weight_2 + w2_over_w = tl.where(new_weight == 0.0, 0.0, weight_2 / new_weight) + return ( + mean_1 + delta * w2_over_w, + m2_1 + m2_2 + delta * delta * weight_1 * w2_over_w, + new_weight, + ) + + +@triton.jit +def welford(mean, m2, weight, dim): + return tl.reduce((mean, m2, weight), dim, welford_combine) + + +@triton.jit +def device_assert_then(cond, msg, r): + tl.device_assert(cond, msg) + return r + + +@triton.jit +def randint64(seed, offset, low, high): + r0, r1, r2, r3 = tl.randint4x(seed, offset) + r0 = r0.to(tl.uint64) + r1 = r1.to(tl.uint64) + result = r0 | (r1 << 32) + size = high - low + result = result % size.to(tl.uint64) + result = result.to(tl.int64) + low + return result + + +@triton.jit +def _any_combine(a, b): + return a | b + + +@triton.jit +def any(a, dim): + return tl.reduce(a, dim, _any_combine) + + +@triton.jit +def bucketize_binary_search( + values, # 1D tensor + offsets_ptr, + indexing_dtype, + right, # bool: if true, use intervals closed on the left; see [Note: Inductor bucketize op] + OFFSETS_SIZE: int, + BLOCK_SHAPE, # tuple/list of block shape +): + """ + See [Note: Inductor bucketize op] + """ + + low = tl.zeros(BLOCK_SHAPE, dtype=indexing_dtype) + high = tl.full(BLOCK_SHAPE, OFFSETS_SIZE, dtype=indexing_dtype) + + full_range = OFFSETS_SIZE + 1 + while full_range > 1: + mid = (high + low) // 2 + mask = mid < OFFSETS_SIZE + bucket_upper_bound = tl.load(offsets_ptr + mid, mask=mask) + if right: + is_above = values >= bucket_upper_bound + else: + is_above = values > bucket_upper_bound + + low = tl.where(is_above & mask, mid + 1, low) + high = tl.where(is_above, high, mid) + + full_range = (full_range + 1) // 2 + + return low + + +@triton.jit +def pack_value_flag( + value, + flag, + DTYPE_VALUE_AS_UINT: tl.constexpr, + DTYPE_PACK: tl.constexpr, +): + # Workaround for triton bug, tensor.to doesn't unwrap constexpr values + 
DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT) + bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth + uv = value.to(DTYPE_VALUE_AS_UINT, bitcast=True).to(DTYPE_PACK) + return flag.to(DTYPE_PACK) | (uv << bitwidth) + + +@triton.jit +def unpack_value( + pack, + DTYPE_VALUE, + DTYPE_VALUE_AS_UINT, +): + # Workaround for triton bug, tensor.to doesn't unwrap constexpr values + DTYPE_VALUE = tl.core._constexpr_to_value(DTYPE_VALUE) + DTYPE_VALUE_AS_UINT = tl.core._constexpr_to_value(DTYPE_VALUE_AS_UINT) + bitwidth = DTYPE_VALUE_AS_UINT.primitive_bitwidth + value_uint = (pack >> bitwidth).to(DTYPE_VALUE_AS_UINT) + return value_uint.to(DTYPE_VALUE, bitcast=True) + + +@triton.jit +def unpack_flag(pack, DTYPE_FLAG): + return pack.to(DTYPE_FLAG) + + +@triton.jit +def exclusive_scan_decoupled_lookback( + scratch_base, + block_value, + index, + combine_fn, + init, + DTYPE_VALUE_AS_UINT: tl.constexpr, + DTYPE_PACK: tl.constexpr, +): + """Compute exclusive scan of a scalar value between blocks + + Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + scratch_base: Pointer to scratch space in global memory + block_value: Scalar value for this block + index: Scalar index of this block relative to the current scan + combine_fn: Function ``(value, value) -> value`` which is scanned over + init: Scalar value equal to the identiy of combine_fn + DTYPE_VALUE_AS_UINT: A tl.uint{n} type equal in size to ``block_value`` + DTYPE_PACK: Unsigned type twice the width of block_value + + NOTE: This function is limited to values which are 32-bits or less. 
+ """ + DTYPE_VALUE = block_value.dtype + pack = pack_value_flag( + block_value, + tl.full(block_value.shape, 1, DTYPE_VALUE_AS_UINT), + DTYPE_VALUE_AS_UINT, + DTYPE_PACK, + ) + tl.atomic_xchg(scratch_base + index, pack, sem="relaxed") + + exclusive_prefix = init + test_target = index - 1 + while test_target >= 0: + # tl.atomic_load + flag = tl.full([], 0, DTYPE_VALUE_AS_UINT) + while flag == 0: + pack = tl.atomic_add(scratch_base + test_target, 0, sem="relaxed") + flag = unpack_flag(pack, DTYPE_VALUE_AS_UINT) + + value = unpack_value(pack, DTYPE_VALUE, DTYPE_VALUE_AS_UINT) + exclusive_prefix = combine_fn(value, exclusive_prefix) + + if flag == 2: + test_target = -1 + else: + test_target = test_target - 1 + + # Make inclusive block sum visible to other blocks + inclusive_prefix = combine_fn(exclusive_prefix, block_value) + pack = pack_value_flag( + inclusive_prefix, + tl.full([], 2, DTYPE_VALUE_AS_UINT), + DTYPE_VALUE_AS_UINT, + DTYPE_PACK, + ) + tl.atomic_xchg(scratch_base + index, pack, sem="relaxed") + return exclusive_prefix + + +@triton.jit +def exclusive_scan_decoupled_lookback_64( + scratch_base, block_value, index, combine_fn, init +): + """Compute exclusive scan of a scalar value between blocks + + Ref: https://research.nvidia.com/publication/2016-03_single-pass-parallel-prefix-scan-decoupled-look-back + + scratch_base: Pointer to scratch space in global memory + block_value: Scalar value for this block, must be 64-bits wide + index: Scalar index of this block relative to the current scan + combine_fn: Function ``(value, value) -> value`` which is scanned over + init: Scalar value equal to the identiy of combine_fn + """ + block_value_u64 = block_value.to(tl.uint64, bitcast=True) + tl.store(scratch_base + 3 * index + 1, block_value_u64) + tl.debug_barrier() + flag_one = tl.full([], 1, tl.uint64) + tl.atomic_xchg(scratch_base + 3 * index + 0, flag_one, sem="release") + + exclusive_prefix = init + test_target = index - 1 + while test_target >= 0: + flag = 
tl.full([], 0, tl.uint64) + while flag == 0: + flag = tl.atomic_add(scratch_base + 3 * test_target + 0, 0, sem="acquire") + + value_u64 = tl.load(scratch_base + 3 * test_target + flag.to(tl.int32)) + value = value_u64.to(block_value.dtype, bitcast=True) + exclusive_prefix = combine_fn(value, exclusive_prefix) + + if flag == 2: + test_target = -1 + else: + test_target = test_target - 1 + + # Make inclusive block sum visible to other blocks + inclusive_prefix = combine_fn(exclusive_prefix, block_value) + inclusive_prefix_u64 = inclusive_prefix.to(tl.uint64, bitcast=True) + tl.store(scratch_base + 3 * index + 2, inclusive_prefix_u64) + tl.debug_barrier() + flag_two = tl.full([], 2, tl.uint64) + tl.atomic_xchg(scratch_base + 3 * index + 0, flag_two, sem="release") + + return exclusive_prefix + + +@triton.jit +def frexp(x): + # TODO(isuruf): use inline_asm_elementwise here + y = libdevice.ilogb(x) + 1 + exponent = tl.where(x == 0, 0, y) + mantissa = tl.where(x == 0, 0, libdevice.ldexp(x, -y)) + return mantissa, exponent diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..bdf19e8d7362272c50d36ff36fcc7c5918a98afb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h @@ -0,0 +1,401 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// this implementation is based on CUDACachingAllocator. +// It utilizes Metal Heaps to improve the performance with buffer allocation. +// Do not include this header. Use MPSAllocatorInterface.h instead. +// TODO: Unify the logic with CUDACachingAllocator and remove redundant code. 
+namespace at::mps::HeapAllocator { + +static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB +static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap +static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB +static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps +static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps +static const size_t kXLargeHeapD = MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps +static const size_t kXLargeHeapU = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps +static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation + +// buffer pools could be customized with a combination of usage flags +enum UsageFlags : uint32_t { + PRIVATE = 0, + SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap + SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device + MANAGED = (1 << 2), // managed storage mode + HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources allocated on the pool + SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream +}; +// debug verbosity flags +enum DebugVerbosity : uint32_t { + SILENT = 0, + PROFILING = (1 << 0), // print generic profiling data for total system memory usage + ALLOCATIONS = (1 << 1), // print buffer allocations + RECYCLES = (1 << 2), // print buffer recycling + RELEASES = (1 << 3), // print buffer releases + LARGE_ONLY = (1 << 4), // only log large buffer pool transactions +}; + +struct HeapBlock; + +struct BufferBlock { + id buffer; + void* cpu_ptr = nullptr; // stores the pointer to CPU mapping of a Shared MTLBuffer + size_t size; // size after alignment + size_t requested_size; // requested size 
(before alignment) + // buffer shape is used for retrieving base of views in cached graphs + std::vector shape; + bool in_use = false; + HeapBlock* heap; + id_t buf_id; + // counter to candidate least recently used buffers for garbage collection + uint32_t gc_count = 0; + uint32_t use_count = 0; + // counter to assign unique ids to buffer blocks + static uint64_t buffer_counter; + // Metal events used to sync GPU/CPU operations on the shared-storage buffers + MPSEventPtr event; + + BufferBlock(size_t Size, size_t RequestedSize = 0, const id Buffer = nullptr, + HeapBlock* Heap = nullptr) : + buffer(Buffer), size(Size), requested_size(RequestedSize), + heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { } + + static bool Comparator(const BufferBlock* a, const BufferBlock* b) { + return (a->size != b->size) ? a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer; + } + static size_t alignUp(size_t Size, size_t Alignment) { + assert(((Alignment - 1) & Alignment) == 0); + return ((Size + Alignment - 1) & ~(Alignment - 1)); + } + uint32_t retainCount() const { return [buffer retainCount]; } +}; +typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*); + +struct BufferPool; +struct AllocParams { + AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) : + search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { } + size_t size() const { return search_key.size; } + + BufferBlock search_key; + BufferPool* pool; + BufferBlock* buffer_block = nullptr; + size_t requested_size; + // true if we exceed the low watermark limit. In this case + // we apply strategies to relieve the pressure before allocation. 
+ bool has_memory_pressure = false; + // true if we're allocating on a unified memory device + bool has_unified_memory = true; +}; + +struct HeapBlock { + id heap; + struct { size_t total, available; } size; + BufferPool* pool; + unsigned int n_buffers = 0; + id_t heap_id; + // indicates if we split this heap to sub-allocate 'several' buffers (otherwise single buffer) + bool is_split; + // counter to assign unique ids to heap blocks + static uint64_t heap_counter; + + HeapBlock(size_t Size, const id Heap = nullptr, BufferPool *Pool = nullptr) : + heap(Heap), size({.total = Size, .available = Size}), pool(Pool), + heap_id(Heap ? ++heap_counter : 0), is_split(true) { } + + static MTLResourceOptions getOptions(uint32_t usage) { + // TODO: check the caching performance of write-combined mode + MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache; + + if (usage & UsageFlags::MANAGED) + options |= MTLResourceStorageModeManaged; + else if (usage & UsageFlags::SHARED) + options |= MTLResourceStorageModeShared; + else + options |= MTLResourceStorageModePrivate; + + options |= (usage & UsageFlags::HAZARD) ? MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked; + + return options; + } + + static HeapBlock* createHeapBlock(AllocParams& params, id device, uint32_t usage) { + HeapBlock *heapBlock = nullptr; + bool is_split = true; + const size_t size = params.size(); + MTLHeapDescriptor *d = [MTLHeapDescriptor new]; + if (d) { + const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD; + if (size <= kMaxSmallAlloc) { + d.size = kSmallHeap; + } else if (size < kMinLargeAlloc) { + d.size = kLargeHeap; + } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) { + d.size = kXLargeHeap; + } else { + d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + is_split = false; + } + d.storageMode = (usage & UsageFlags::SHARED) ? 
MTLStorageModeShared : MTLStorageModePrivate; + d.cpuCacheMode = MTLCPUCacheModeDefaultCache; + // this automatically handles Metal buffer access synchronizations at the + // cost of slightly lower performance. + d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked; + d.resourceOptions = getOptions(usage); + d.type = MTLHeapTypeAutomatic; + id heap = [device newHeapWithDescriptor: d]; + if (heap) { + [heap setPurgeableState:MTLPurgeableStateNonVolatile]; + const size_t heap_size = heapAvailableSize(heap); + heapBlock = new HeapBlock(heap_size, heap, params.pool); + if (heapBlock) { + heapBlock->is_split = is_split; + } + } + [d release]; + } + return heapBlock; + } + static bool Comparator(const HeapBlock* a, const HeapBlock* b) { + return (a->size.available != b->size.available) ? a->size.available < b->size.available : + (uintptr_t)a->heap < (uintptr_t)b->heap; + } + static NSUInteger heapAvailableSize(id heap, size_t Alignment = vm_page_size) { + return [heap maxAvailableSizeWithAlignment:Alignment]; + } + NSUInteger Size() { + return [heap size]; + } + id newMTLBuffer(size_t length, uint32_t usage) { + id buf = [heap newBufferWithLength:length options:getOptions(usage)]; + if (buf) { + updateAvailableSize(); + n_buffers++; + } + return buf; + } + // returns the retainCount before releasing the buffer + uint32_t releaseMTLBuffer(id& buffer) { + const uint32_t retainCount = [buffer retainCount]; + [buffer release]; + buffer = nil; + updateAvailableSize(); + n_buffers--; + return retainCount; + } + // returns the retainCount before releasing the heap + uint32_t releaseMTLHeap() { + const uint32_t retainCount = [heap retainCount]; + TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty + [heap setPurgeableState:MTLPurgeableStateEmpty]; + [heap release]; + heap = nil; + size.available = 0; + return retainCount; + } + uint32_t retainCount() const { return [heap retainCount]; } + void 
updateAvailableSize() { size.available = heapAvailableSize(heap); } +}; +typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*); + +struct BufferPool { + enum class Kind { + PRIVATE_SMALL, + PRIVATE_LARGE, + SHARED_SMALL, + SHARED_LARGE, + SCALAR, + }; + + BufferPool(const id Device, uint32_t Usage) : + device(Device), usage(Usage), + heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { } + + const id device; + // usage flags to customize the pool for various purposes (see UsageFlags enum) + const uint32_t usage; + // total number of buffers in the pool + uint32_t n_buffers = 0; + // total allocations size on this pool + size_t allocated_size = 0; + // total memory available in the pool + size_t available_size = 0; + // list of heaps ordered by their "available" (not total) memory size + std::set heaps; + // list of only "available" buffers in the pool (i.e., buffers not in-use) + std::set available_buffers; + // list of buffers that are in a state of "limbo" where they've already been freed + // from PyTorch-side, but were not returned to pool due to still being + // in-use by command buffers with retainCount > 1. In this state, the buffer is + // neither ready to be recycled, nor could be returned to pool as available. + // These buffers will be returned to pool once the command buffer's + // completionHandler callbacks are called. 
+ std::unordered_set buffers_pending_free; + // list of heaps pending size update + std::unordered_set heaps_pending_update; +}; + +class MPSHeapAllocatorImpl { +public: + explicit MPSHeapAllocatorImpl() : + m_device(at::mps::MPSDevice::getInstance()->device()), + m_max_buffer_size([m_device maxBufferLength]), + m_stream(getDefaultMPSStream()), + m_event_pool(getMPSEventPool()) { + init_allocator(); + } + ~MPSHeapAllocatorImpl() { + emptyCache(); + } + // interface exposed to at::Allocator + id malloc(size_t size, uint32_t usage); + // frees a buffer and returns it into buffer pool + void free(void* ptr); + // releases all the cached buffers and their associated heaps + void emptyCache(); + // free inactive buffers that are pending to be freed + void freeInactiveBuffers(); + // returns true if buffer was allocated from the shared pool + bool isSharedBuffer(const void* ptr); + // get the requested unaligned size of an MTLBuffer + ssize_t getUnalignedBufferSize(const void* ptr); + // set the shape of a base tensor from a view tensor + void setBufferShape(const void* ptr, const IntArrayRef& shape); + // retrieve the shape of a base tensor from a view tensor + IntArrayRef getBufferShape(const void* ptr); + // get the unique ID of the buffer + id_t getBufferId(const void* ptr); + // allocate a buffer from a specialized pool to import CPU scalars into GPU + id allocScalarBufferWithValue(void* value, size_t size); + // returns a CPU-mapping of the input buffer and its retainCount, + // if only it has Shared storage-mode and allocated on MPSAllocator + std::pair getSharedBufferPtr(const void* buffer); + // records events for a list of MTLBuffers (list is used to lock the mutex once) + // returns true if records any event (given if passed buffers exist and are shared-storage) + bool recordEvents(c10::ArrayRef buffers); + // waits for the event to signal the completion of GPU execution + // on the passed shared buffers (list is used to lock the mutex once) + // returns true 
if actually waited on any event + bool waitForEvents(c10::ArrayRef buffers); + // this indicates how far (in Megabytes) the current total allocations are from the + // low watermark limit which is used to detect if we're under memory pressure + // This returns zero if we've reached the low watermark limit + ssize_t getLowWatermarkValue(); + // (see m_low_watermark_ratio for description) + void setLowWatermarkRatio(double ratio); + // (see m_high_watermark_ratio for description) + void setHighWatermarkRatio(double ratio); + // (see m_low_watermark_limit for description) + size_t getLowWatermarkLimit() const { return m_low_watermark_limit; } + // (see m_max_total_allowed_size for description) + size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; } + // (see m_total_allocated_memory for description) + size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; } + // (see m_current_allocated_memory for description) + size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; } + // total GPU memory allocated in the process by Metal driver; including + // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl. + size_t getDriverAllocatedMemory() const { return current_allocated_size(); } + // (see enum DebugVerbosity for description) + uint32_t getDebugVerbosity() const { return m_debug_verbosity; } + // returns the device that we allocate from + inline id Device() const { return m_device; } + + // TODO: make a common function to do size unit conversions in PyTorch. + inline std::string format_size(uint64_t size) const; + +private: + // (see m_high_watermark_ratio for description) + constexpr static double default_high_watermark_ratio = 1.7; + // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize. 
+ constexpr static double default_high_watermark_upper_bound = 2.0; + // (see m_low_watermark_ratio for description) + // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize + constexpr static double default_low_watermark_ratio_unified = 1.4; + constexpr static double default_low_watermark_ratio_discrete = 1.0; + + const id m_device; + std::recursive_mutex m_mutex; + // allocated buffers by device pointer + ska::flat_hash_map m_allocated_buffers; + // using a container for pools to simplify iterating them + ska::flat_hash_map> m_pools; + // total memory allocated by HeapAllocator (including blocks in pools) + size_t m_total_allocated_memory = 0; + // currently active memory allocations in use (i.e., blocks not in pools) + size_t m_current_allocated_memory = 0; + // max buffer size allowed by Metal + size_t m_max_buffer_size = 0; + // maximum total size allowed to be allocated + size_t m_max_total_allowed_size = 0; + // high watermark ratio is a hard limit for the total allowed allocations + // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs) + // 1. : recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize) + // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize + // e.g., value 0.95 means we allocate up to 95% of recommended maximum + // allocation size; beyond that, the allocations would fail with OOM error. + double m_high_watermark_ratio; + // low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark + // level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit). + // Value between 0 to m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection) + // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum + // allocation size. 
+ double m_low_watermark_ratio; + // low watermark size limit (in Bytes) at the time we initialize the allocator + size_t m_low_watermark_limit; + // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity + uint32_t m_debug_verbosity; + // default MPS stream + MPSStream* m_stream; + // we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator + std::shared_ptr m_event_pool; + + void init_allocator(); + void init_buffer_pools(); + HeapBlock* get_free_heap(AllocParams& params); + bool get_free_buffer(AllocParams& params); + BufferBlock* get_allocated_buffer_block(const void* ptr); + BufferBlock* alloc_buffer_block(size_t size, uint32_t usage); + bool alloc_buffer(AllocParams& params); + void free_buffer(BufferBlock* buffer_block); + // returns true if the container heap is also released + bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true); + void release_buffers(BufferPool& pool); + bool release_available_cached_buffers(AllocParams& params); + bool release_cached_buffers(); + // free unused cached blocks to reclaim GPU memory if memory pressure is high + void garbage_collect_cached_buffers(AllocParams& params); + // returns the suitable buffer pool type for the usage or + // requested/allocated sizes + BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage); + // returns the aligned allocation size that is optimized + // for the buffers to get reused frequently + size_t get_allocation_size(size_t size, uint32_t usage) const; + // maximum size of device memory available for allocation in current process + // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory. 
+ size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; } + // there are implicit allocations from MPS backend, so we need to query the 'device' for + // total allocated size instead of manually tracking in MPSAllocator + size_t current_allocated_size() const { return [m_device currentAllocatedSize]; } + + bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const { + for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) { + MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event); + } + return true; + } +}; + +} // namespace at::mps::HeapAllocator diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..e30a02c3fb213400eb587654b411fca2ee2b06c4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h @@ -0,0 +1,61 @@ +// Copyright © 2023 Apple Inc. + +#pragma once + +#include +#include +#include + +#define MB(x) (x * 1048576UL) + +namespace at::mps { + +// this is a public interface to access MPSAllocator. +// Do not declare methods that would depend on MPS or Metal frameworks. +class IMPSAllocator : public c10::Allocator { +public: + // see the comments in MPSAllocator.h for the description of these methods. 
+ virtual void emptyCache() const = 0; + virtual void freeInactiveBuffers() const = 0; + virtual ssize_t getUnalignedBufferSize(const void* ptr) const = 0; + virtual IntArrayRef getBufferShape(const void* ptr) const = 0; + virtual id_t getBufferId(const void* ptr) const = 0; + virtual void setBufferShape(const void* ptr, const IntArrayRef& shape) const = 0; + virtual bool isSharedBuffer(const void* ptr) const = 0; + virtual bool isSharedStorageSupported() const = 0; + virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) const = 0; + virtual std::string formatSize(size_t size) const = 0; + virtual void setLowWatermarkRatio(double ratio) const = 0; + virtual void setHighWatermarkRatio(double ratio) const = 0; + virtual ssize_t getLowWatermarkValue() const = 0; + virtual size_t getLowWatermarkLimit() const = 0; + virtual size_t getHighWatermarkLimit() const = 0; + virtual size_t getTotalAllocatedMemory() const = 0; + virtual size_t getCurrentAllocatedMemory() const = 0; + virtual size_t getDriverAllocatedMemory() const = 0; + virtual std::pair getSharedBufferPtr(const void* ptr) const = 0; + virtual bool recordEvents(c10::ArrayRef buffers) const = 0; + virtual bool waitForEvents(c10::ArrayRef buffers) const = 0; +}; + +class IMpsAllocatorCallback { + public: + enum class EventType { + ALLOCATED, // buffer got allocated to be used immediately + RECYCLED, // buffer pulled from free list to be reused + FREED, // buffer put to free list for future recycling + RELEASED, // buffer memory released + ALLOCATION_FAILED // buffer allocation failed + }; + virtual ~IMpsAllocatorCallback() = default; + virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0; +}; + +// MPS allocator will execute every registered callback when a block of memory is freed. +C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback); +#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...) 
\ + C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__); + +IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false); + +} // namespace at::mps diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h new file mode 100644 index 0000000000000000000000000000000000000000..880ff1c75d12e17ecf719f3de875c2217f852e51 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h @@ -0,0 +1,100 @@ +// Copyright © 2023 Apple Inc. + +#pragma once + +#include +#include +#include + +namespace at::mps { + +// NOTE: don't create instances of this class directly. +// Use MPSEventPool to acquire instances of MPSEvent. +class MPSEvent { +public: + explicit MPSEvent(id_t ID, MPSStream* stream, bool enable_timing); + ~MPSEvent(); + + // records an event on the stream + void record(bool needsLock, bool syncEvent = false); + // makes all future work submitted to the stream wait for this event. + bool wait(bool needsLock, bool syncEvent = false); + // schedules a notifyListener callback for the event. + bool notify(bool needsLock, MTLSharedEventNotificationBlock block); + // checks if events are already signaled. + bool query() const; + // blocks the CPU thread until all the GPU work that were scheduled + // prior to recording this event are completed. 
+ bool synchronize(); + // resets this event with new parameters in case it gets reused from the event pool + void reset(MPSStream* stream, bool enable_timing); + // returns the unique ID of the event instance + id_t getID() const { return m_id; } + // returns the completion timestamp of the event + uint64_t getCompletionTime() const { return m_completion_time; } + // if already recorded, waits for cpu_sync_cv to be signaled + void waitForCpuSync(); + +private: + id_t m_id; + // enables measuring the completion time of the notifyListener of this event + bool m_enable_timing; + uint64_t m_signalCounter = 0; + MPSStream* m_stream = nullptr; + MTLSharedEvent_t m_event = nullptr; + MTLSharedEventListener* m_listener = nullptr; + // used to sync the events created on this Stream with CPU + std::mutex m_cpu_sync_mutex{}; + std::condition_variable m_cpu_sync_cv{}; + // CondVar predicate to sync the events created on this Stream with CPU + bool m_cpu_sync_completed = false; + // used to compute elapsed time + uint64_t m_completion_time = 0; + + void recordLocked(bool syncEvent); + bool waitLocked(bool syncEvent); + bool notifyLocked(MTLSharedEventNotificationBlock block); + void notifyCpuSync(); + static uint64_t getTime() { + return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); + } +}; + +typedef std::unique_ptr> MPSEventPtr; + +class MPSEventPool { +public: + explicit MPSEventPool(MPSStream* default_stream); + ~MPSEventPool(); + + MPSEventPtr acquireEvent(bool enable_timing, MPSStream* stream); + void emptyCache(); + + // these are mainly used for MPSHooks and torch.mps.Event() bindings + id_t acquireEvent(bool enable_timing); + void releaseEvent(id_t event_id); + void recordEvent(id_t event_id, bool syncEvent); + void waitForEvent(id_t event_id, bool syncEvent); + void synchronizeEvent(id_t event_id); + bool queryEvent(id_t event_id); + // returns elapsed time between two recorded events in milliseconds + double elapsedTime(id_t start_event_id, id_t end_event_id); + 
+private: + MPSStream* m_default_stream = nullptr; + std::recursive_mutex m_mutex; + std::stack> m_pool{}; + // dictionary to associate event IDs with event objects + // used to retain in-use events out of the pool + // for torch.mps.Event() bindings. + std::unordered_map m_in_use_events{}; + uint64_t m_event_counter = 0; + std::function m_default_deleter; + + MPSEvent* getInUseEvent(id_t event_id, bool locked = true); +}; + +// shared_ptr is used to get MPSEventPool destroyed after dependent instances +std::shared_ptr getMPSEventPool(); + +} // namespace at::mps diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h new file mode 100644 index 0000000000000000000000000000000000000000..994c50ad9e61c6b0634d80ec3921824f662de59c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h @@ -0,0 +1,393 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at::mps { + +namespace Profiler { + +struct BaseInfo { + // profiling info types + enum class Type { + GRAPH, + KERNEL, + COPY, + CPU_FALLBACK, + }; + + BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) : + type(infoType), profileId(Id), handle(Handle) { } + virtual ~BaseInfo() = default; + + // type of profiling info + Type type; + // unique profile ID for execution instances of operations or copies + uint64_t profileId; + // ID generated by os_signpost + // since it's possible to use event and interval-based signposts at the + // same time, we need separate IDs for each. 
+ os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0; + // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime") + std::atomic totalGpuTime{0.0}; + // accumulated Scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime") + std::atomic totalSchedulingTime{0.0}; + // indicates if the operation or copy execution has completed + std::atomic_bool completed{false}; + // handle used to identify the profile info's instance (usually the pointer) + const uintptr_t handle; + + virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const; + // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()]) + static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) { + if (tensor.defined()) { + std::stringstream tensorStr; + auto deviceType = tensor.device().type(); + tensorStr << c10::DeviceTypeName(deviceType); + // see comments for INCLUDE_BUFFER_ID + if (includeBufferId && deviceType == at::kMPS) { + id buffer = __builtin_bit_cast(id, tensor.storage().data()); + tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) + << ":" << buffer.retainCount << ")"; + } + tensorStr << ":" + << tensor.scalar_type() << tensor.sizes(); + return tensorStr.str(); + } else { + return "undefined"; + } + } + static uint64_t getTime() { + return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); + } +}; + +struct OperationInfo : BaseInfo { + OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) : + BaseInfo(IsGraph ? 
Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { } + + uint64_t runCount = 0; + std::string strKey; + + const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override; + + // builds a string for a kernel + static std::string buildKernelString(const std::string& kernelName, + const TensorList& tensors, + bool includeBufferId = false) { + std::stringstream kernelStr; + kernelStr << kernelName; + for (const Tensor& tensor: tensors) { + kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId); + } + return kernelStr.str(); + } +}; + +struct CpuFbInfo : BaseInfo { + CpuFbInfo(uint64_t Id, const std::string& OpName) : + BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { } + + uint64_t runCount = 0; + // the current and total overhead of copies in bytes required to convert the Op's + // input tensors from MPS to CPU and then output from CPU back to MPS + size_t currentCopyOverhead = 0; + size_t totalCopyOverhead = 0; + std::string opName; + std::string strKey; + uint64_t startTime = 0; + + const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override; + + void updateCopyOverhead(const TensorList& tensors) { + currentCopyOverhead = 0; + for (const Tensor& tensor: tensors) { + if (tensor.defined()) { + currentCopyOverhead += tensor.nbytes(); + } + } + totalCopyOverhead += currentCopyOverhead; + } +}; + +struct CopyInfo : BaseInfo { + enum class Kind { + MPS_TO_MPS, + MPS_TO_CPU, + CPU_TO_MPS, + }; + + CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) : + BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS), + length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { } + + Kind kind; + size_t length; + bool isNonBlocking; + bool usesBlitter; + std::string srcStrKey; + std::string dstStrKey; + // for copies that don't use blitters, we measure CPU time + uint64_t startTime = 0; + + const std::string 
toString(double gpuTime = 0, double schedulingTime = 0) const override; + + static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false); + + static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) { + if (tensor.has_value()) { + return tensor->device().type() == at::kMPS; + } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer); + // getUnalignedBufferSize() returns -1 if input buffer is not on MPS device + return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0; + } + + static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer, + const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) { + const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor); + const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS); + if (isSrcOnMPS && !isDstOnMPS) { + return Kind::MPS_TO_CPU; + } else if (!isSrcOnMPS && isDstOnMPS) { + return Kind::CPU_TO_MPS; + } + return Kind::MPS_TO_MPS; + } +}; + +struct CopyStat : CopyInfo { + explicit CopyStat(std::string CopyKindStr) : + CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {} + // total number of copies + size_t totalCount = 0; + // number of Scalar copies (i.e., less than sizeof(int64)) + size_t scalarsCount = 0; + // number of blocking copies (i.e., require syncing to GPU) + size_t blockingCount = 0; + // number of copies that used memcpy(), instead of Metal Blit Encoder + size_t memcpyCount = 0; + // accumulated GPU time in ms for the scalar copies + std::atomic scalarsGpuTime{0.0}; + // copy kind in string type + std::string kindStr; +}; + +class MPSProfiler { +public: + // lower 16 bits used for profiler options + enum ProfileOptions : uint32_t { + OPTIONS_NONE = 0, + // ALL_* means, all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.) 
+ // (used for convenience to not compute bit flags by OR-ing manually) + // trace all signpost types using events + ALL_SIGNPOST_EVENTS = (1 << 0), + // trace all signpost types using intervals + ALL_SIGNPOST_INTERVALS = (1 << 1), + // always wait for command buffer to finish executing after each commit + WAIT_UNTIL_COMPLETED = (1 << 2), + // for interval-based signposts, include the scheduling portion of + // Graph/Kernel/Copy executions as well. + // if flag is disable, only "GPU run time" is included in interval, + // and not schedule time. + INCLUDE_SCHEDULE_INTERVAL = (1 << 3), + + // use these if you need to trace signposts types individually (rarely required) + // trace signpost using intervals + USE_INTERVALS = (1 << 4), + // trace signpost by emitting events + USE_EVENTS = (1 << 5), + // used for sanity check (Change this when new option added) + OPTIONS_COUNT = (USE_EVENTS << 1) - 1, + }; + + // when adding new types, #define the type string in MPSProfiler.mm as well. + // upper 16 bits used for event types + enum SignpostTypes : uint32_t { + SIGNPOST_NONE = 0, + // trace signposts for PyTorch operation executions + RUN_OPERATION = (1 << 16), + // trace signposts for blitter copies + BLIT_COPY = (1 << 17), + // trace signposts for ops that fall back on CPU + CPU_FALLBACK = (1 << 18), + // used for sanity check (Change this when new type added) + SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1, + }; + + enum LogOptions : uint32_t { + LOG_NONE = 0, + + // Info logging options during execution + // ------------------------------------- + // prints operation info (id/key/run_count) during execution + OPERATION_INFO = (1 << 0), + // prints copy info (src/dst tensors/buffers, size, etc.) 
during execution + COPY_INFO = (1 << 1), + // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution + CPU_FALLBACK_INFO = (1 << 2), + + // Profiling Statistics logging options when process terminates + // ------------------------------------------------------------ + // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before process terminates + // this is convenient to not combine following stats bit flags manually + ALL_STATS = (1 << 3), + // prints operation stats (GPU times, run count, etc.) before process terminates + OPERATION_STATS = (1 << 4), + // prints copies stats (GPU times, copy kinds, sizes, etc.) before process terminates + COPY_STATS = (1 << 5), + // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies + // for tensors, etc.) before process terminates + CPU_FALLBACK_STATS = (1 << 6), + + // Metadata format options when logging the info + // --------------------------------------------- + // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime + // from Metal Command Buffers) (e.g., [GPU=0.324 ms]) + INCLUDE_GPU_TIME = (1 << 7), + // if enabled, includes GPU scheduling time in metadata separately + // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers) + // e.g., [GPU=0.324 ms, KRNL=0.036 ms] + INCLUDE_KERNEL_TIME = (1 << 8), + // if enabled, includes the unique buffer ID in metadata for the storage + // of a tensor that was allocated on MPSAllocator. This is useful (along with + // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved + // with various operations. 
+ INCLUDE_BUFFER_ID = (1 << 9), + + // used for sanity check (Change this when new option added) + LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1, + }; + + explicit MPSProfiler(); + ~MPSProfiler(); + + // the handle is either "MPSGraph*" or "id" for Metal Kernels + // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy + uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph); + uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors); + uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer, + const OptionalTensorRef srcTensor, + const OptionalTensorRef dstTensor, + size_t length, bool isNonBlocking, bool usesBlitter = true); + uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors); + void beginProfileGPUInterval(const void* handle); + + void endProfileCopy(uint64_t profileId, SyncType syncType); + void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE); + void endProfileCPUFallback(const std::string& opName); + + // these are used to hook into Python bindings for torch.mps.profiler module. + // this enables generating OS Signpost traces from MPSProfiler on-demand + // during runtime (instead of environment variables). + // The "mode" could be either "interval", "event", or both "interval,event" + // for interval-based and/or event-based signpost tracing. 
+ void StartTrace(const string& mode, bool waitUntilCompleted); + void StopTrace(); + + // convenience functions to indicate whether signpost tracing or + // logging are enabled for the SignpostTypes + bool isOperationProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::RUN_OPERATION) || + (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS)); + } + bool isCopyProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::BLIT_COPY) || + (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS)); + } + bool isCPUFallbackProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::CPU_FALLBACK) || + (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS)); + } + bool isSignpostTracingEnabled() const { + return (m_signpost_types != SignpostTypes::SIGNPOST_NONE); + } + + private: + // indicates what type of signpost types are enabled and traced by MPS profiler. + uint32_t m_signpost_types = 0; + uint32_t m_profile_options = 0; + uint32_t m_log_options = 0; + uint64_t m_kernel_counter = 0; + uint64_t m_graph_counter = 0; + uint64_t m_cpu_fb_counter = 0; + uint64_t m_copy_counter = 0; + // technically, it's possible to trace both events and intervals at the same time + // so we use separate os_log categories for them + os_log_t m_os_log_events; + os_log_t m_os_log_intervals; + // stats logging could run either from destructor or signal handler + // so this is used to check if logging has already started. + std::atomic_bool hasLoggedStats{false}; + // indicates there are pending completionHandler callbacks that haven't been called yet. 
+ std::atomic_bool hasPendingCompletionHandlers{false}; + // used to capture sigint signal to log profiling stats + static struct sigaction currentSigint, previousSigint; + + // We use the following lists for two reasons: + // 1- for interval-based signposts the "begin" point won't be in same function + // as the "end" point where we need to be able to retrieve signpost's info + // 2- if Operations info need to be logged when process ends using LogOptions::OPERATION_INFO. + + // the pointer key for this map is either "MPSGraph*" or "id" for Metal Kernels + // this list is retained and could be logged along with aggregate profiling numbers when the process ends. + std::unordered_map> m_op_info_list{}; + // the string key for this map is the op name that we fall back to execute on CPU + // this list is retained and could be logged along with aggregate profiling numbers when the process ends. + std::unordered_map> m_cpu_fb_info_list{}; + // this list contains the info for copies, and its key is the unique profileId + // which is generated from m_copy_counter + // The copyInfo list is not retained. 
+ std::unordered_map> m_copy_info_list{}; + // a short list that contains copy stats + std::unordered_map> m_copy_stat_list{}; + + void initialize(); + void beginProfileExecution(BaseInfo& info, bool cpuExecution = false); + void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id, + os_signpost_id_t interval_signpost_id, + double gpuTime, double schedulingTime); + void addProfilerScheduledHandler(BaseInfo& info); + void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType); + void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id, + const std::string& msg) const; + void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id, + const std::string& msg) const; + void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const; + + void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime); + // returns true if logging the profiling info "during the execution" is enabled + bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded); + // logs all the profiling stats that are enabled + void logProfilingStats(); + // logs kernel profiling stats when the process ends. + void logOperationsProfilingStats(std::FILE* f) const; + // logs CPU Fallback profiling stats when the process ends. + void logCPUFallbackProfilingStats(std::FILE* f) const; + // logs copy profiling stats when the process ends. 
+ void logCopyProfilingStats(std::FILE* f) const; + + os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr); + static SignpostTypes getSignpostType(BaseInfo::Type infoType); + static void handleIntSignal(int signal); +}; + +} // namespace Profiler + +Profiler::MPSProfiler& getMPSProfiler(); + +} // namespace at::mps diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h new file mode 100644 index 0000000000000000000000000000000000000000..efbe7ce1b9d1ca650dffe238740e3f08a950f07f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/BatchLinearAlgebra.h @@ -0,0 +1,321 @@ +#pragma once + +#include +#include +#include +#include + +// Forward declare TI +namespace at { +class Tensor; +struct TensorIterator; + +namespace native { +enum class TransposeType; +} + +} + +namespace at::native { + +enum class LapackLstsqDriverType : int64_t { Gels, Gelsd, Gelsy, Gelss}; + +#if AT_BUILD_WITH_LAPACK() +// Define per-batch functions to be used in the implementation of batched +// linear algebra operations + +template +void lapackCholesky(char uplo, int n, scalar_t *a, int lda, int *info); + +template +void lapackCholeskyInverse(char uplo, int n, scalar_t *a, int lda, int *info); + +template +void lapackEig(char jobvl, char jobvr, int n, scalar_t *a, int lda, scalar_t *w, scalar_t* vl, int ldvl, scalar_t *vr, int ldvr, scalar_t *work, int lwork, value_t *rwork, int *info); + +template +void lapackGeqrf(int m, int n, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info); + +template +void lapackOrgqr(int m, int n, int k, scalar_t *a, int lda, scalar_t *tau, scalar_t *work, int lwork, int *info); + +template +void lapackOrmqr(char side, char trans, int m, int n, int k, scalar_t *a, int lda, 
scalar_t *tau, scalar_t *c, int ldc, scalar_t *work, int lwork, int *info); + +template +void lapackSyevd(char jobz, char uplo, int n, scalar_t* a, int lda, value_t* w, scalar_t* work, int lwork, value_t* rwork, int lrwork, int* iwork, int liwork, int* info); + +template +void lapackGels(char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info); + +template +void lapackGelsd(int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + value_t *s, value_t rcond, int *rank, + scalar_t* work, int lwork, + value_t *rwork, int* iwork, int *info); + +template +void lapackGelsy(int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + int *jpvt, value_t rcond, int *rank, + scalar_t *work, int lwork, value_t* rwork, int *info); + +template +void lapackGelss(int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + value_t *s, value_t rcond, int *rank, + scalar_t *work, int lwork, + value_t *rwork, int *info); + +template +struct lapackLstsq_impl; + +template +struct lapackLstsq_impl { + static void call( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackGels( + trans, m, n, nrhs, + a, lda, b, ldb, + work, lwork, info); + } +}; + +template +struct lapackLstsq_impl { + static void call( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackGelsy( + m, n, nrhs, + a, lda, b, ldb, + jpvt, rcond, rank, + work, lwork, rwork, info); + } +}; + +template +struct lapackLstsq_impl { + static void call( + char trans, int m, int n, 
int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackGelsd( + m, n, nrhs, + a, lda, b, ldb, + s, rcond, rank, + work, lwork, + rwork, iwork, info); + } +}; + +template +struct lapackLstsq_impl { + static void call( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackGelss( + m, n, nrhs, + a, lda, b, ldb, + s, rcond, rank, + work, lwork, + rwork, info); + } +}; + +template +void lapackLstsq( + char trans, int m, int n, int nrhs, + scalar_t *a, int lda, scalar_t *b, int ldb, + scalar_t *work, int lwork, int *info, // Gels flavor + int *jpvt, value_t rcond, int *rank, value_t* rwork, // Gelsy flavor + value_t *s, // Gelss flavor + int *iwork // Gelsd flavor + ) { + lapackLstsq_impl::call( + trans, m, n, nrhs, + a, lda, b, ldb, + work, lwork, info, + jpvt, rcond, rank, rwork, + s, + iwork); +} + +template +void lapackLuSolve(char trans, int n, int nrhs, scalar_t *a, int lda, int *ipiv, scalar_t *b, int ldb, int *info); + +template +void lapackLu(int m, int n, scalar_t *a, int lda, int *ipiv, int *info); + +template +void lapackLdlHermitian( + char uplo, + int n, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* work, + int lwork, + int* info); + +template +void lapackLdlSymmetric( + char uplo, + int n, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* work, + int lwork, + int* info); + +template +void lapackLdlSolveHermitian( + char uplo, + int n, + int nrhs, + scalar_t* a, + int lda, + int* ipiv, + scalar_t* b, + int ldb, + int* info); + +template +void lapackLdlSolveSymmetric( + char uplo, + int n, + int nrhs, + scalar_t* a, + int lda, + 
int* ipiv, + scalar_t* b, + int ldb, + int* info); + +template +void lapackSvd(char jobz, int m, int n, scalar_t *a, int lda, value_t *s, scalar_t *u, int ldu, scalar_t *vt, int ldvt, scalar_t *work, int lwork, value_t *rwork, int *iwork, int *info); +#endif + +#if AT_BUILD_WITH_BLAS() +template +void blasTriangularSolve(char side, char uplo, char trans, char diag, int n, int nrhs, scalar_t* a, int lda, scalar_t* b, int ldb); +#endif + +using cholesky_fn = void (*)(const Tensor& /*input*/, const Tensor& /*info*/, bool /*upper*/); +DECLARE_DISPATCH(cholesky_fn, cholesky_stub); + +using cholesky_inverse_fn = Tensor& (*)(Tensor& /*result*/, Tensor& /*infos*/, bool /*upper*/); + +DECLARE_DISPATCH(cholesky_inverse_fn, cholesky_inverse_stub); + +using linalg_eig_fn = void (*)(Tensor& /*eigenvalues*/, Tensor& /*eigenvectors*/, Tensor& /*infos*/, const Tensor& /*input*/, bool /*compute_eigenvectors*/); + +DECLARE_DISPATCH(linalg_eig_fn, linalg_eig_stub); + +using geqrf_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/); +DECLARE_DISPATCH(geqrf_fn, geqrf_stub); + +using orgqr_fn = Tensor& (*)(Tensor& /*result*/, const Tensor& /*tau*/); +DECLARE_DISPATCH(orgqr_fn, orgqr_stub); + +using ormqr_fn = void (*)(const Tensor& /*input*/, const Tensor& /*tau*/, const Tensor& /*other*/, bool /*left*/, bool /*transpose*/); +DECLARE_DISPATCH(ormqr_fn, ormqr_stub); + +using linalg_eigh_fn = void (*)( + const Tensor& /*eigenvalues*/, + const Tensor& /*eigenvectors*/, + const Tensor& /*infos*/, + bool /*upper*/, + bool /*compute_eigenvectors*/); +DECLARE_DISPATCH(linalg_eigh_fn, linalg_eigh_stub); + +using lstsq_fn = void (*)( + const Tensor& /*a*/, + Tensor& /*b*/, + Tensor& /*rank*/, + Tensor& /*singular_values*/, + Tensor& /*infos*/, + double /*rcond*/, + std::string /*driver_name*/); +DECLARE_DISPATCH(lstsq_fn, lstsq_stub); + +using triangular_solve_fn = void (*)( + const Tensor& /*A*/, + const Tensor& /*B*/, + bool /*left*/, + bool /*upper*/, + TransposeType /*transpose*/, 
+ bool /*unitriangular*/); +DECLARE_DISPATCH(triangular_solve_fn, triangular_solve_stub); + +using lu_factor_fn = void (*)( + const Tensor& /*input*/, + const Tensor& /*pivots*/, + const Tensor& /*infos*/, + bool /*compute_pivots*/); +DECLARE_DISPATCH(lu_factor_fn, lu_factor_stub); + +using unpack_pivots_fn = void(*)( + TensorIterator& iter, + const int64_t dim_size, + const int64_t max_pivot); +DECLARE_DISPATCH(unpack_pivots_fn, unpack_pivots_stub); + +using lu_solve_fn = void (*)( + const Tensor& /*LU*/, + const Tensor& /*pivots*/, + const Tensor& /*B*/, + TransposeType /*trans*/); +DECLARE_DISPATCH(lu_solve_fn, lu_solve_stub); + +using ldl_factor_fn = void (*)( + const Tensor& /*LD*/, + const Tensor& /*pivots*/, + const Tensor& /*info*/, + bool /*upper*/, + bool /*hermitian*/); +DECLARE_DISPATCH(ldl_factor_fn, ldl_factor_stub); + +using svd_fn = void (*)( + const Tensor& /*A*/, + const bool /*full_matrices*/, + const bool /*compute_uv*/, + const c10::optional& /*driver*/, + const Tensor& /*U*/, + const Tensor& /*S*/, + const Tensor& /*Vh*/, + const Tensor& /*info*/); +DECLARE_DISPATCH(svd_fn, svd_stub); + +using ldl_solve_fn = void (*)( + const Tensor& /*LD*/, + const Tensor& /*pivots*/, + const Tensor& /*result*/, + bool /*upper*/, + bool /*hermitian*/); +DECLARE_DISPATCH(ldl_solve_fn, ldl_solve_stub); +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h new file mode 100644 index 0000000000000000000000000000000000000000..c2e61f280bf5922e911cdbc180a3503d75d8a06f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/EmbeddingBag.h @@ -0,0 +1,139 @@ +#include +#include +#include + +#ifdef USE_FBGEMM +#include +#endif + +namespace at::native { + +void check_arguments( + const Tensor& weight, + const Tensor& indices, 
+ const Tensor& offsets, + const int64_t mode, + const c10::optional& per_sample_weights, + bool include_last_offset); + +void make_bag_size_out( + Tensor& bag_size_out, + const Tensor& offsets, + const Tensor& indices, + const int64_t mode, + const bool include_last_offset, + const bool requires_grad); + +void make_max_indices_out( + Tensor& max_indices_out, + const Tensor& weight, + const Tensor& indices, + const Tensor& offsets, + const Tensor& bag_size, + const int64_t mode, + bool include_last_offset); + +void make_offset2bag_out( + Tensor& offset2bag, + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const Tensor& offsets, + const int64_t mode, + const c10::optional& per_sample_weights, + const int64_t padding_idx = -1); + +#ifdef USE_FBGEMM + +template +struct _CallbackAndBlockSize { + using TCallback = typename fbgemm::EmbeddingSpMDMKernelSignature::Type; + + int64_t blockSize = -1; + TCallback callback = nullptr; + + static TCallback generateCallback(int64_t block_size) { + return fbgemm::GenerateEmbeddingSpMDM( + block_size, + has_weight, + /* normalize_by_lengths */false, + /* prefetch */16, + /* is_weight_positional */false, + /* use_offsets */true); + } + + _CallbackAndBlockSize() = default; + + explicit _CallbackAndBlockSize(c10::optional maybe_block_size) + : blockSize(maybe_block_size.value_or(-1)) + , callback(maybe_block_size.has_value() ? generateCallback(maybe_block_size.value()) : nullptr) + {} +}; + +template +struct _EmbeddingBagKernelCacheImpl : private StorageMixins... { + + _EmbeddingBagKernelCacheImpl() = default; + // use each of the mixins to store corresponding kernel and block size + explicit _EmbeddingBagKernelCacheImpl(c10::optional maybe_block_size) + : StorageMixins(maybe_block_size)... 
+ {} + + // this method is thread safe (call sites may call from different threads) + template + typename _CallbackAndBlockSize::TCallback + getCallback(int64_t block_size) const { + // if the cache doesn't store the kernel for the incoming block size + // (so it is different from the one stored in corresponding mixin) + // regenerate the kernel (not writing it into the cache so we avoid locks) + if (block_size != _CallbackAndBlockSize::blockSize) { + return _CallbackAndBlockSize::generateCallback(block_size); + } + // else retrieve the cached kernel from the corresponding mixin + return _CallbackAndBlockSize::callback; + } +}; + +// instantiate the cache with the list of storage mixins +// for each of the 8 _EmbeddingBagKernelCache* usages in the EmbeddingBag.cpp impl file +using _EmbeddingBagKernelCache = _EmbeddingBagKernelCacheImpl< + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize, + _CallbackAndBlockSize>; +#else +struct _EmbeddingBagKernelCache { + explicit _EmbeddingBagKernelCache(c10::optional /* maybe_block_size */) {} +}; +#endif + +void _embedding_bag_cpu_impl_out(Tensor& output, Tensor& offset2bag, + Tensor& bag_size, Tensor* max_indices, + const Tensor &weight, const Tensor &indices, + const Tensor &offsets, const int64_t mode = 0, + const c10::optional& per_sample_weights = c10::nullopt, + bool include_last_offset = false, + int64_t padding_idx = -1, + _EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + +void _embedding_bag_cpu_out( + at::Tensor& output, + at::Tensor& offset2bag, + at::Tensor& bag_size, + at::Tensor* p_max_indices, + const at::Tensor& weight, + const at::Tensor& indices, + const at::Tensor& offsets, + const bool scale_grad_by_freq, + const int64_t mode, + const bool sparse, + const c10::optional& per_sample_weights, + const bool include_last_offset, + const c10::optional& padding_idx, + 
_EmbeddingBagKernelCache* fbgemm_kernel_cache = nullptr); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h new file mode 100644 index 0000000000000000000000000000000000000000..f6de9580ae7c33340d2929c4c5f743e4aaf42339 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Fill.h @@ -0,0 +1,21 @@ +// Functions that fill Tensors with constants. Implementations are in Fill.cpp. + +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { +class Tensor; +struct TensorIterator; + +namespace native { + +DECLARE_DISPATCH(void(*)(TensorIterator&, const c10::Scalar&), fill_stub); + +Tensor& fill_out(Tensor& self, const Scalar& value); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h new file mode 100644 index 0000000000000000000000000000000000000000..f21269620f25345644a91dd59009f7e2c8c99121 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LossMulti.h @@ -0,0 +1,72 @@ +#pragma once +#include +#include +#include +#include + +namespace at::native { +namespace { + static C10_UNUSED void multilabel_margin_loss_shape_check( + int64_t& nframe, + int64_t& dim, + const int64_t& ndims, + const Tensor& input, + const Tensor& target) { + TORCH_CHECK( + (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0, + "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", + input.sizes()); + + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 
1 : input.size(0); + TORCH_CHECK( + target.dim() <= 1 && target.numel() == dim, + "inconsistent target size: ", target.sizes(), " for input of size: ", + input.sizes()); + } else { + nframe = input.size(0); + dim = input.size(1); + TORCH_CHECK( + target.dim() == 2 && target.size(0) == nframe && + target.size(1) == dim, + "inconsistent target size: ", target.sizes(), " for input of size: ", + input.sizes()); + } + } + + static C10_UNUSED void multi_margin_loss_shape_check( + int64_t& nframe, + int64_t& dim, + const int64_t& ndims, + const Tensor& input, + const Tensor& target, + const c10::optional& weight) { + TORCH_CHECK( + (ndims == 2 && input.size(1) != 0) || (ndims == 1 && input.size(0) != 0) || ndims == 0, + "Expected non-empty vector or matrix with optional 0-dim batch size, but got: ", + input.sizes()); + + if (ndims <= 1) { + nframe = 1; + dim = ndims == 0 ? 1 : input.size(0); + } else { + nframe = input.size(0); + dim = input.size(1); + } + + TORCH_CHECK( + target.dim() <= 1 && target.numel() == nframe, + "inconsistent target size, expected ", nframe, " but got ", + target.sizes()); + if (weight && weight->defined()) { + TORCH_CHECK( + weight->dim() <= 1 && weight->numel() == dim, + "inconsistent weight size, expected ", dim, " but got ", + weight->sizes()); + } +} + + +} // anonymous namespace +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h new file mode 100644 index 0000000000000000000000000000000000000000..6cd4dcde370522874311f43dbcdfca0e16bc5035 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Normalization.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace at::native { + +using renorm_scale_factor_fn = void (*) (TensorIteratorBase& iter, double maxnorm); 
+DECLARE_DISPATCH(renorm_scale_factor_fn, renorm_scale_factor_stub); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pow.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pow.h new file mode 100644 index 0000000000000000000000000000000000000000..068482ee300c73d7bd185482f85d109e400e9cc8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Pow.h @@ -0,0 +1,69 @@ +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { + +struct TensorIterator; +struct TensorIteratorBase; + +namespace native { + +#if defined(__CUDACC__) || defined(__HIPCC__) +#define HOST_DEVICE __host__ __device__ +#else +#define HOST_DEVICE +#endif + +// integral power in pytorch allows for negative exponents, giving truncated integral results. +// e.g. since 2**-1==0.5, the truncated integral result is zero. 1**negative_exponent is the +// only non-zero result. +template ::value, T>::type* = nullptr> +static inline HOST_DEVICE __ubsan_ignore_signed_int_overflow__ T powi_impl(T a, T b) { + T result = 1; + while (b) { + if (b & 1) { + result *= a; + } + b /= 2; + a *= a; + } + return result; +} + +template ::value && !std::is_signed::value, T>::type* = nullptr> +static inline HOST_DEVICE T powi(T a, T b) { + return powi_impl(a, b); +} + +template ::value && std::is_signed::value, T>::type* = nullptr> +static inline HOST_DEVICE T powi(T a, T b) { + if ( b < 0 ) { + if ( a == 1 ) { + return 1; + } else if ( a == -1 ) { + auto negative = (-b) % static_cast(2); + return negative ? 
-1 : 1; + } else { + return 0; + } + } + return powi_impl(a, b); +} + +using pow_tensor_tensor_fn = void (*)(TensorIteratorBase&); +using pow_tensor_scalar_fn = void (*)(TensorIteratorBase&, const c10::Scalar&); + +DECLARE_DISPATCH(pow_tensor_tensor_fn, pow_tensor_tensor_stub); +DECLARE_DISPATCH(pow_tensor_scalar_fn, pow_tensor_scalar_stub); + +} // namespace native + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceOps.h new file mode 100644 index 0000000000000000000000000000000000000000..604d6ae8a74ef2ee617d11868859c90541efb3d3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/ReduceOps.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include + +namespace c10 { +class Scalar; +} + +namespace at { +struct TensorIterator; +class Tensor; +} + +namespace at::native { + +using reduce_fn = void(*)(TensorIterator &); + +DECLARE_DISPATCH(reduce_fn, sum_stub); +DECLARE_DISPATCH(reduce_fn, nansum_stub); +DECLARE_DISPATCH(reduce_fn, prod_stub); +DECLARE_DISPATCH(reduce_fn, mean_stub); +DECLARE_DISPATCH(reduce_fn, and_stub); +DECLARE_DISPATCH(reduce_fn, or_stub); +DECLARE_DISPATCH(reduce_fn, min_values_stub); +DECLARE_DISPATCH(reduce_fn, max_values_stub); +DECLARE_DISPATCH(reduce_fn, argmax_stub); +DECLARE_DISPATCH(reduce_fn, argmin_stub); + +using reduce_std_var_function = + void (*)(TensorIterator&, double correction, bool take_sqrt); +DECLARE_DISPATCH(reduce_std_var_function, std_var_stub); + +using reduce_norm_fn = + void (*)(Tensor&, const Tensor&, const c10::Scalar&, c10::optional); +DECLARE_DISPATCH(reduce_norm_fn, norm_kernel); + +using reduce_fn_flag = void(*)(TensorIterator &, const c10::Scalar&); +DECLARE_DISPATCH(reduce_fn_flag, norm_stub); + +using structured_cum_fn = void (*)(const Tensor&, const Tensor&, int64_t); 
+using cum_fn = void (*)(Tensor&, const Tensor&, int64_t); +DECLARE_DISPATCH(structured_cum_fn, cumsum_stub); +DECLARE_DISPATCH(structured_cum_fn, cumprod_stub); +DECLARE_DISPATCH(cum_fn, logcumsumexp_stub); + +DECLARE_DISPATCH(void (*)(const Tensor&, int64_t, bool, Tensor&, Tensor&), aminmax_stub); +DECLARE_DISPATCH(void (*)(const Tensor&, Tensor&, Tensor&), aminmax_allreduce_stub); + +// Used in cuda/Normalization.cu +TORCH_API std::tuple var_mean_out( + Tensor &result1, Tensor &result2, const Tensor &self, IntArrayRef dim, + int64_t correction, bool keepdim); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..17e42ebe84a0e8b0906a76ba9c937c6c46027caa --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SobolEngineOpsUtils.h @@ -0,0 +1,55 @@ +/// This file contains some tensor-agnostic operations to be used in the +/// core functions of the `SobolEngine` +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#endif + +namespace at::native::sobol_utils { + +/// Function to return the minimum of number of bits to represent the integer `n` +inline int64_t bit_length(const int64_t n) { + int64_t nbits, nloc; + for (nloc = n, nbits = 0; nloc > 0; nloc /= 2, nbits++); + return nbits; +} + +/// Function to get the position of the rightmost zero in the bit representation of an integer +/// This value is the zero-indexed position +inline int64_t rightmost_zero(const int64_t n) { + int64_t z, i; + for (z = n, i = 0; z % 2 == 1; z /= 2, i++); + return i; +} + +/// Function to get a subsequence of bits in the representation of an integer starting from +/// `pos` and of length `length` +inline int64_t 
bitsubseq(const int64_t n, const int64_t pos, const int64_t length) { + return (n >> pos) & ((1 << length) - 1); +} + +/// Function to perform the inner product between a batched square matrix and a power of 2 vector +inline at::Tensor cdot_pow2(const at::Tensor& bmat) { + at::Tensor inter = at::arange(bmat.size(-1) - 1, -1, -1, bmat.options()); + inter = at::pow(2, inter).expand_as(bmat); + return at::mul(inter, bmat).sum(-1); +} + +/// All definitions below this point are data. These are constant, and should not be modified +/// without notice + +constexpr int64_t MAXDIM = 21201; +constexpr int64_t MAXDEG = 18; +constexpr int64_t MAXBIT = 30; +constexpr int64_t LARGEST_NUMBER = 1 << MAXBIT; +constexpr float RECIPD = 1.0 / LARGEST_NUMBER; + +extern const int64_t poly[MAXDIM]; +extern const int64_t initsobolstate[MAXDIM][MAXDEG]; + +} // namespace at::native::sobol_utils diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorCompare.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorCompare.h new file mode 100644 index 0000000000000000000000000000000000000000..b4dfa689b1d216cb697076781935afb81a587fae --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorCompare.h @@ -0,0 +1,49 @@ +#pragma once + +#include + +namespace c10 { +class Scalar; +} + +namespace at { +class Tensor; +struct TensorIterator; +struct TensorIteratorBase; +} + +namespace at::native { + +using reduce_minmax_fn = + void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool); +using structured_reduce_minmax_fn = + void (*)(const Tensor&, const Tensor&, const Tensor&, int64_t, bool); + +DECLARE_DISPATCH(structured_reduce_minmax_fn, max_stub); +DECLARE_DISPATCH(structured_reduce_minmax_fn, min_stub); + +using where_fn = void (*)(TensorIterator &); +DECLARE_DISPATCH(where_fn, where_kernel); + +using is_infinity_op_fn = void 
(*)(TensorIteratorBase &); +DECLARE_DISPATCH(is_infinity_op_fn, isposinf_stub); +DECLARE_DISPATCH(is_infinity_op_fn, isneginf_stub); + +using mode_fn = void (*)(Tensor&, Tensor&, const Tensor&, int64_t, bool); +DECLARE_DISPATCH(mode_fn, mode_stub); + +using clamp_tensor_fn = void (*)(TensorIteratorBase &); +DECLARE_DISPATCH(clamp_tensor_fn, clamp_stub); + +namespace detail { + enum class ClampLimits {Min, Max, MinMax}; +} + +DECLARE_DISPATCH(void (*)(TensorIteratorBase &, const c10::Scalar&, const c10::Scalar&), clamp_scalar_stub); +DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_min_scalar_stub); +DECLARE_DISPATCH(void (*)(TensorIteratorBase &, c10::Scalar), clamp_max_scalar_stub); + +using isin_default_fn = void (*)(const Tensor&, const Tensor&, bool, const Tensor&); +DECLARE_DISPATCH(isin_default_fn, isin_default_stub); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIterator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIterator.h new file mode 100644 index 0000000000000000000000000000000000000000..e55d2a58d709926a24467a0056323096e0890fa9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorIterator.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..cc56fa6457e75bc980747afc9d2d72257d6c093b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TriangularOpsUtils.h @@ -0,0 +1,57 @@ +#include +#include + +namespace at::native { + +/* + * Given batches of matrices with arbitrary batch dim, + * computes 
the number of batches for Triu and Tril. This ignores stride 0 dimension + */ +static inline int64_t batchCountTrilTriu(const Tensor& batched_matrices) { + int64_t result = 1; + for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) { + if (batched_matrices.stride(i) != 0) { + result *= batched_matrices.size(i); + } + } + return result; +} + +/* Checks a necessary property for the triu and tril implementations, hence the name. + * Here batch contiguity is checked for tensors with greater than 4 dimensions. + * Contiguous tensors and tensors with less than 3 dimensions pass this check + */ +static inline std::tuple checkTrilTriuBatchContiguous(const Tensor& tensor, bool allow_zero_stride) { + // Complete contiguity is the most desired property, which is why + // we return true if the tensor is contiguous + if (tensor.is_contiguous()) { + auto default_strides_for_size = batched_matrix_contiguous_strides(tensor.sizes()); + if (tensor.strides() == default_strides_for_size) { + return std::make_tuple(true, tensor); + } else { + return std::make_tuple(false, tensor.as_strided(tensor.sizes(), default_strides_for_size)); + } + } + + int64_t dims = tensor.dim(); + + // Tensors with dimension less than 4 are handled by default + if (allow_zero_stride && dims <= 3) { + return std::make_tuple(true, tensor); + } + + int64_t expected_stride = tensor.size(-1) * tensor.size(-2); + for (int64_t i = dims - 3; i >= 0; i--) { + // Skip trivial dimension; + if (allow_zero_stride && i == 0 && (tensor.stride(i) == 0 || tensor.size(i) == 1)) { + continue; + } + if (expected_stride != tensor.stride(i)) { + return std::make_tuple(false, tensor.contiguous()); + } + expected_stride *= tensor.size(i); + } + return std::make_tuple(true, tensor); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IsContiguous.h 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IsContiguous.h new file mode 100644 index 0000000000000000000000000000000000000000..192177cc9bcfb0988171fb68554ab56a7120ed4c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IsContiguous.h @@ -0,0 +1,62 @@ +#pragma once + +namespace at { namespace native { inline namespace CPU_CAPABILITY { + +// n: number of function arguments (arity) +// traits: function_traits (see FunctionTraits.h) +// s: index of scalar argument or -1 +template +struct IsContiguous { + static bool eval(const int64_t* strides) { + using type = typename traits::template arg::type; + return strides[stride_index] == (s == n ? 0 : sizeof(type)) && + IsContiguous::eval(strides); + } +}; + +// will be called when there is an output exists +template +struct IsContiguous<0, 0, traits, s> { + static bool eval(const int64_t* strides) { + return strides[0] == sizeof(typename traits::result_type); + } +}; + +// will be called when there is no output +template +struct IsContiguous<0, -1, traits, s> { + static bool eval(const int64_t* /*strides*/) { + return true; + } +}; + +// output and all inputs are contiguous +template ::value>::type* = nullptr> +static inline bool is_contiguous(const int64_t* strides) { + return IsContiguous::eval(strides); +} + +template ::value>::type* = nullptr> +static inline bool is_contiguous(const int64_t* strides) { + return IsContiguous::eval(strides); +} + +// input at `s` is scalar (stride 0); output and other inputs are contiguous +// NB: output is typically at strides[0] so first input corresponds to s=1 +template ::value>::type* = nullptr> +static inline bool is_contiguous_scalar(const int64_t* strides) { + static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds"); + return IsContiguous::eval(strides); +} + +template ::value>::type* = nullptr> +static inline bool is_contiguous_scalar(const 
int64_t* strides) { + static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds"); + return IsContiguous::eval(strides); +} + +}}} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ee9fac647ad6241c97e28a7af6f091d5d613bc3a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SoftmaxKernel.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using forward_fn = void (*)(const Tensor&, const Tensor&); +using backward_fn = void(*)(const Tensor &, const Tensor &, const Tensor&); + +DECLARE_DISPATCH(forward_fn, softmax_lastdim_kernel); +DECLARE_DISPATCH(forward_fn, log_softmax_lastdim_kernel); +DECLARE_DISPATCH(backward_fn, softmax_backward_lastdim_kernel); +DECLARE_DISPATCH(backward_fn, log_softmax_backward_lastdim_kernel); + +using forward_fn_with_dim = void(*)(const Tensor &, const Tensor &, const int64_t); +using backward_fn_with_dim = + void (*)(const Tensor&, const Tensor&, const Tensor&, const int64_t); + +DECLARE_DISPATCH(forward_fn_with_dim, softmax_kernel); +DECLARE_DISPATCH(forward_fn_with_dim, log_softmax_kernel); +DECLARE_DISPATCH(backward_fn_with_dim, softmax_backward_kernel); +DECLARE_DISPATCH(backward_fn_with_dim, log_softmax_backward_kernel); +} +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e764cc4ce803905a88363f00291a2066d76bb274 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/CUDAJitLoops.cuh @@ -0,0 +1,296 @@ +#pragma once +#include + +// Jiterator functions are guarded behind this macro +#if AT_USE_JITERATOR() + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace at { +namespace native { + +template +constexpr auto tuple_to_array_helper(Tuple& t, std::index_sequence seq) { + constexpr auto size = seq.size(); + (void)t; // warning : unused parameter when tuple is empty. + return std::array{static_cast(&std::get(t))...}; +} + +// Helper function convert tuple to std::array +// for passing the arguments to CUDA Kernel +// NOTE: We capture tuple by reference, +// so the pointers in returned array are only valid +// till tuple is alive. +template +constexpr auto tuple_to_array(std::tuple& extra_args) { + constexpr auto tuple_size = sizeof...(Args); + return tuple_to_array_helper(extra_args, std::make_index_sequence{}); +} + +struct JittedVecKernelCache { + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + at::cuda::jit::NvrtcFunction vec1; + at::cuda::jit::NvrtcFunction vec2; + at::cuda::jit::NvrtcFunction vec4; +}; + +struct JittedKernelVariantCache { + JittedVecKernelCache vec; + at::cuda::jit::NvrtcFunction noncontiguous; + at::cuda::jit::NvrtcFunction dynamic_contiguous; + at::cuda::jit::NvrtcFunction dynamic_noncontiguous; +}; + +inline c10::SmallBuffer pack_kernel_args( + std::initializer_list args, + c10::ArrayRef extra_args) { + c10::SmallBuffer ret(args.size() + extra_args.size()); + std::copy(args.begin(), args.end(), ret.data()); + std::copy(extra_args.begin(), extra_args.end(), ret.data() + args.size()); + return ret; +} + +template +void launch_jitted_unrolled_kernel( + std::mutex &jiterator_mutex, + at::cuda::jit::NvrtcFunction &fn_cache, + const 
at::cuda::jit::KernelDescriptor &desc, + int64_t N, + array_t data, + inp_calc_t ic, + out_calc_t oc, + loader_t l, + storer_t s, + bool contiguous, + at::cuda::jit::BinaryFuncVariant scalar_pos, + void* scalar_val, + c10::ArrayRef extra_args) { + + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + //casting result to int is always safe, intermediate is int64 and won't overflow + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); + + if (!fn_cache.function) { + const std::lock_guard lock{jiterator_mutex}; + if (!fn_cache.function) { + constexpr bool dynamic_casting = !std::is_same() || + !std::is_same(); + auto code = at::cuda::jit::generate_code( + desc, contiguous, dynamic_casting, scalar_pos); + fn_cache = at::cuda::jit::jit_pwise_function(code, desc.name); + } + } + + auto args = pack_kernel_args({&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function(fn_cache, args.data(), {grid, 1u, 1u}, + {num_threads(), 1u, 1u}); +} + +template +void launch_jitted_vectorized_kernel( + std::mutex &jiterator_mutex, JittedVecKernelCache &fn_cache, + const at::cuda::jit::KernelDescriptor &desc, int64_t N, array_t data, + at::cuda::jit::BinaryFuncVariant scalar_pos, + void *scalar_val, c10::ArrayRef extra_args) { + TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits::max()); + // N is still int64_t for the computation, but it's always safe to cast result to int + const uint32_t grid = (N + block_work_size() - 1) / block_work_size(); + const int vec_size = at::cuda::jit::can_vectorize_up_to( + desc, c10::ArrayRef(data.data, data.size())); + + // Different kernels are compiled depending on what we're vectorizing up to (1, 2 or 4 elements) + // fn_ptr is set to the appropriate function based on the vec size and GPU used + at::cuda::jit::NvrtcFunction* fn_ptr; + if (vec_size == 4) { + fn_ptr = &fn_cache.vec4; + } else if (vec_size == 2) { + fn_ptr = &fn_cache.vec2; + } else if (vec_size ==1) { + 
fn_ptr = &fn_cache.vec1; + } else { + TORCH_INTERNAL_ASSERT(false, "unexpected vec_size for jitter vectorized kernel"); + } + + bool vectorized = vec_size > 1; + + if (!fn_ptr->function) { + const std::lock_guard lock{jiterator_mutex}; + if (!fn_ptr->function) { // cache miss! + + // Generates program + auto code = at::cuda::jit::generate_code( + desc, /*contiguous=*/true, /*dynamic_casting=*/false, + scalar_pos, vectorized, vec_size); + std::string kernel_name = vectorized ? desc.name + "_vectorized" + std::to_string(vec_size) : desc.name; + + // Acquires the program + *fn_ptr = at::cuda::jit::jit_pwise_function(code, kernel_name); + } + } + + if (vectorized) { + auto args = pack_kernel_args({&N, &data, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function( + *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); + } else { +// NVCC complains about unused variables l and s. +// It should be false positive in most cases, so we suppress the warnings. +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 + auto ic = TrivialOffsetCalculator(); + auto oc = TrivialOffsetCalculator<1>(); + auto l = memory::LoadWithoutCast(); + auto s = memory::StoreWithoutCast(); + + auto args = pack_kernel_args( + {&N, &data, &ic, &oc, &l, &s, scalar_val}, extra_args); + at::cuda::jit::launch_jitted_pwise_function( + *fn_ptr, args.data(), {grid, 1u, 1u}, {num_threads(), 1u, 1u}); +#pragma nv_diagnostic pop + } +} + +template +void jitted_gpu_kernel_generic( + std::mutex &jiterator_mutex, + JittedKernelVariantCache &cache, + const at::cuda::jit::KernelDescriptor &desc, + at::cuda::jit::BinaryFuncVariant scalar_pos, + c10::ArrayRef extra_args, + TensorIteratorBase& iter, + const bool dynamic_casting, + void *scalar_val) { + TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); + TORCH_INTERNAL_ASSERT(iter.ninputs() == arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + + constexpr int ntensors = arity + 1; + at::detail::Array data; + for (auto i : 
c10::irange(ntensors)) { + data[i] = (char*)iter.data_ptr(i); + } + + int64_t numel = iter.numel(); + bool contiguous = iter.is_contiguous(); + + // Decides which of 4 kernel types to launch + // Variations are: + // - Case 1: no dynamic casting and contiguous + // - Case 2: no dynamic casting and noncontiguous + // - Case 3: dynamic casting and contiguous + // - Case 4: dynamic casting and noncontiguous + // These cases align with the non-jitted CUDALoops.cuh cases in gpu_kernel_impl + + if (!dynamic_casting) { + if (contiguous) { + // Case 1: no dynamic casting and contiguous + launch_jitted_vectorized_kernel( + jiterator_mutex, cache.vec, desc, + numel, data, scalar_pos, scalar_val, extra_args); + return; + } + + // Case 2: no dynamic casting and noncontiguous + auto input_offset_calculator = make_input_offset_calculator(iter); + auto output_offset_calculator = make_output_offset_calculator(iter); + auto loader = memory::LoadWithoutCast(); + auto storer = memory::StoreWithoutCast(); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.noncontiguous, desc, numel, data, + input_offset_calculator, output_offset_calculator, loader, + storer, contiguous, scalar_pos, scalar_val, extra_args); + return; + } + + // Cases 3 and 4 are handled below + // Both require construction of a storer (this asserts 1 output) and one or more loaders + + // Creates store cast to output (the zeroth tensor in TensorIterator) + auto storer = memory::StoreWithCast<1>(iter); + + // Creates load casts from inputs (note offset indexing into the iterators 1...n tensors) + auto loader = memory::LoadWithCast(iter); + + if (contiguous) { + // Case 3: dynamic casting and contiguous + auto input_offset_calculator = TrivialOffsetCalculator(); + auto output_offset_calculator = TrivialOffsetCalculator<1>(); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.dynamic_contiguous, desc, numel, data, input_offset_calculator, + output_offset_calculator, loader, storer, contiguous, scalar_pos, 
scalar_val, extra_args); + return; + } + + // Case 4: dynamic casting and noncontiguous + auto input_offset_calculator = make_input_offset_calculator(iter); + auto output_offset_calculator = make_output_offset_calculator(iter); + launch_jitted_unrolled_kernel( + jiterator_mutex, cache.dynamic_noncontiguous, desc, numel, data, input_offset_calculator, + output_offset_calculator, loader, storer, contiguous, scalar_pos, scalar_val, extra_args); +} + +// NOTE: static to reduce chances of name collision. +template < + char const* name, + typename result_type, + typename f_inputs_type, + int arity, + at::cuda::jit::BinaryFuncVariant scalar_pos = + at::cuda::jit::BinaryFuncVariant::NoScalar, + typename... ExtraArgs> +static void jitted_gpu_kernel_impl( + TensorIteratorBase& iter, + const std::string &f, + const bool dynamic_casting, + at::opmath_type scalar_val, + std::tuple extra_args) { + + // TODO: Memory use can probably be optimized by re-using kernels across GPUs with + // the same compute capability + static std::mutex jiterator_mutex; + static std::vector device_caches(c10::cuda::device_count()); + + constexpr int nInputs = arity; + constexpr int nOutputs = 1; // TODO: Support more than 1 output + static const auto desc = at::cuda::jit::make_kernel_descriptor< + result_type, f_inputs_type, ExtraArgs...>(name, f, nInputs, nOutputs); + + auto &cache = device_caches[iter.device().index()]; + auto extra_args_array = tuple_to_array(extra_args); + return jitted_gpu_kernel_generic( + jiterator_mutex, + cache, + desc, + scalar_pos, + extra_args_array, + iter, + dynamic_casting, + &scalar_val + ); +} + +}} // at::native + +#endif // AT_USE_JITERATOR() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h new file mode 100644 index 
0000000000000000000000000000000000000000..aace9c30b0a7e9d08de71c4baf1490d45ff6d36e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/GridSampler.h @@ -0,0 +1,32 @@ +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at { +namespace native { + +void launch_grid_sampler_2d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_3d_forward_kernel( + const TensorBase &output, const TensorBase &input, const TensorBase &grid, + int64_t interpolation_mode, int64_t padding_mode, bool align_corners); + +void launch_grid_sampler_2d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +void launch_grid_sampler_3d_backward_kernel( + const TensorBase &grad_input, const TensorBase &grad_grid, + const TensorBase &grad_output, const TensorBase &input, + const TensorBase &grid, int64_t interpolation_mode, int64_t padding_mode, + bool align_corners, std::array output_mask); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f0a620e27fcd5d55520fa672beb34faf8eaa5510 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MemoryAccess.cuh @@ -0,0 +1,384 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// References: +// 
https://devblogs.nvidia.com/cuda-pro-tip-increase-performance-with-vectorized-memory-access/ + +namespace at { namespace native { namespace memory { + +namespace detail { + +// What does the `static_unroll` do? +// +// We want to do something like: +// +// using args_t = typename traits::ArgsTuple; +// args_t args; +// #pragma unroll +// for (int i = 0; i < traits::arity; i++) { +// std::get(args) = .... +// } +// +// but unfortunately the above code does not work because +// the template argument has to be a compile time constant +// so `static_unroll` is created to simulate `#pragma unroll` +// using template metaprogramming. + +template typename func, int end, int current=0> +struct static_unroll { + template + static inline C10_HOST_DEVICE void with_args(Args&&... args) { + func::apply(std::forward(args)...); + static_unroll::with_args(args...); + } +}; + +template typename func, int end> +struct static_unroll { + template + static inline C10_HOST_DEVICE void with_args(Args... args) {} +}; + +// helper structs to be used with static_unroll to load arguments +// one by one + +template +struct vectorized_load_helper { + template + static __device__ void apply(policy_t &self, args_t *args, int idx) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + auto ptr = reinterpret_cast(self.data[arg_index + 1]) + block_work_size() * idx; + auto args_accessor = [&args] __device__ (int thread_unroll_idx) -> arg_t & { return std::get(args[thread_unroll_idx]); }; + self.load_single_arg(args_accessor, ptr); + } +}; + +template +struct unroll_load_helper { + template + static __device__ void apply(policy_t &self, args_t *args, offset_t offset, loader_t loader, int j, int num_outputs) { + using arg_t = std::tuple_element_t; + // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + std::get(args[j]) = 
loader.template load(self.data[arg_index + num_outputs], offset[arg_index], arg_index); + } +}; + +template +struct multi_outputs_store_helper { + template + C10_HOST_DEVICE static void apply( + at::detail::Array data, + at::detail::Array offsets, + thrust::tuple ret) { + using T = typename thrust::tuple_element>::type; + T *to = reinterpret_cast(data[current]) + offsets[current]; + *to = thrust::get(ret); + } +}; + +} // namespace detail + +struct LoadWithoutCast { + template + __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { + return c10::load(reinterpret_cast(base_ptr) + offset); + } +}; + +template +struct LoadWithCast { + using array_t = at::detail::Array(N, 1)>; + using size_array_t = at::detail::Array(N, 1)>; + + array_t dtypes; + size_array_t element_sizes; + + LoadWithCast(const TensorIteratorBase& iter) { + CUDA_KERNEL_ASSERT(iter.ninputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i + iter.noutputs()); + element_sizes[i] = c10::elementSize(iter.dtype(i + iter.noutputs())); + } + } + + template + __device__ scalar_t load(char *base_ptr, uint32_t offset, int arg) { + void *ptr = base_ptr + element_sizes[arg] * offset; + return c10::fetch_and_cast(dtypes[arg], ptr); + } +}; + +struct StoreWithoutCast { + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { + *(reinterpret_cast(base_ptr) + offset) = value; + } +}; + +template +struct StoreWithCast { + using array_t = at::detail::Array(N, 1)>; + using size_array_t = at::detail::Array(N, 1)>; + + array_t dtypes; + size_array_t element_sizes; + + StoreWithCast(const TensorIteratorBase& iter) { + CUDA_KERNEL_ASSERT(iter.noutputs() == N); + #pragma unroll + for (auto i = 0; i < N; ++i) { + this->dtypes[i] = iter.dtype(i); + element_sizes[i] = c10::elementSize(iter.dtype(i)); + } + } + + template + __device__ void store(scalar_t value, char *base_ptr, uint32_t offset, int arg = 0) { + void *ptr = base_ptr 
+ element_sizes[arg] * offset; + c10::cast_and_store(dtypes[arg], ptr, value); + } +}; + +// aligned vector generates vectorized load/store on CUDA +template +struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; +}; + +template +__device__ aligned_vector load_vector(const scalar_t *base_ptr, uint32_t offset) { + using vec_t = aligned_vector; + auto *from = reinterpret_cast(base_ptr); + return from[offset]; +} + +template +__device__ aligned_vector load_vector(const bool *base_ptr, uint32_t offset) { + // See NOTE [Loading boolean values] + auto tmp = load_vector(reinterpret_cast(base_ptr), offset); + aligned_vector ret; + for (int i = 0; i < vec_size; ++i) { + ret.val[i] = bool(tmp.val[i]); + } + return ret; +} + +namespace policies { + +// Assumption: +// all tensors are contiguous, that is: stride == sizeof(type) for all tensors +template +struct unroll { + + data_t data; + int remaining; + inp_calc_t input_offset_calculator; + out_calc_t output_offset_calculator; + loader_t loader; + storer_t storer; + + __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s): + data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {} + + __device__ inline bool check_inbounds(int thread_work_elem) { + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size::value; + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offset = input_offset_calculator.get(linear_idx); + detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); + thread_idx += num_threads(); + } + } + + template + __device__ inline void store(scalar_t *from, int idx) { + int 
thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + int offset = output_offset_calculator.get(linear_idx)[0]; + storer.store(from[i], data[0], offset); + thread_idx += num_threads(); + } + } +}; + +// Assumption: +// all tensors are contiguous, that is: stride == sizeof(type) for all tensors +// Note: +// Functions in vectorized policy does not do boundary check. It assumes the whole block +// has its job to do. So the reminders should be handled by the caller manually. +template // vec_size: number of scalars, can be 1, 2, or 4. +struct vectorized { + + static_assert(thread_work_size() % vec_size == 0, "The workload per thread must be a multiple of vec_size"); + static constexpr int loop_size = thread_work_size() / vec_size; + + data_t data; + + __device__ vectorized(data_t data) : data(data) {} + + __device__ inline constexpr bool check_inbounds(int thread_work_elem) { + return true; + } + + template + __device__ inline void load_single_arg(accessor_t to, scalar_t *from) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads(); + auto v = load_vector(from, index); + #pragma unroll + for (int j = 0; j < vec_size; j++) { + to(vec_size * i + j) = v.val[j]; + } + } + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size::value; + detail::static_unroll::with_args(*this, args, idx); + } + + template + __device__ inline void store(scalar_t *from, int idx) { + using vec_t = aligned_vector; + scalar_t *to = reinterpret_cast(data[0]) + block_work_size() * idx; + vec_t *to_ = reinterpret_cast(to); + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < loop_size; i++) { + int index = thread_idx + i * num_threads(); + vec_t v; + for (int j = 0; j < vec_size; j++) { + v.val[j] = 
from[vec_size * i + j]; + } + to_[index] = v; + } + } +}; + +template +struct multi_outputs_unroll { + //multi_outputs_unroll struct members and check_inbounds and load methods are copypasted from unroll struct + //we don't use inheritance because of compiler bug in cuda 10.2+ + data_t data; + int remaining; + inp_calc_t input_offset_calculator; + out_calc_t output_offset_calculator; + LoadWithoutCast loader; + StoreWithoutCast storer; + + __device__ multi_outputs_unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc): + data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {} + + __device__ inline bool check_inbounds(int thread_work_elem) { + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); + } + + template + __device__ inline void load(args_t *args, int idx) { + constexpr int arity = std::tuple_size::value; + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offset = input_offset_calculator.get(linear_idx); + detail::static_unroll::with_args(*this, args, offset, loader, i, num_outputs); + thread_idx += num_threads(); + } + } + + + template + __device__ inline void store(return_t *from, int idx) { + int thread_idx = threadIdx.x; + #pragma unroll + for (int i = 0; i < thread_work_size(); i++) { + if (thread_idx >= this->remaining) { + return; + } + int linear_idx = thread_idx + block_work_size() * idx; + auto offsets = this->output_offset_calculator.get(linear_idx); + memory::detail::static_unroll::with_args(this->data, offsets, from[i]); + thread_idx += num_threads(); + } + } +}; + +} // namespace policies + +// This is only used in host, but we will wrap this into some templates +// which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE +// in order to compile +template +inline C10_HOST_DEVICE int can_vectorize_up_to(char 
*pointer) { + uint64_t address = reinterpret_cast(pointer); + constexpr int vec2_alignment = std::alignment_of>::value; + constexpr int vec4_alignment = std::alignment_of>::value; + if (address % vec4_alignment == 0) { + return 4; + } else if (address % vec2_alignment == 0) { + return 2; + } + return 1; +} + +template +struct can_vectorize_up_to_helper { + template + static C10_HOST_DEVICE void apply(int &result, array_t pointers, traits _) { + using arg_t = typename traits::template arg::type; + // `pointers` hold the data_ptr for tensors [output, input0, input1, ...], so we + // need a +1 offset to get the input + result = std::min(result, can_vectorize_up_to(pointers[i + 1])); + } +}; + +template +inline int can_vectorize_up_to(array_t pointers) { + using traits = function_traits; + using return_t = typename traits::result_type; + constexpr int arity = traits::arity; + int result = can_vectorize_up_to(pointers[0]); + // We need to get the type for each argument of `func_t`, this can only + // be done at compile time. 
+ detail::static_unroll::with_args(result, pointers, traits()); + return result; +} + +}}} // namespace at::native::memory diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh new file mode 100644 index 0000000000000000000000000000000000000000..17f14444abd14a03de30f57d3be7254f51a957f9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/MultiTensorApply.cuh @@ -0,0 +1,379 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at::native { + +namespace { + +static constexpr int64_t kILP = 4; +static constexpr int64_t kChunkSize = 65536; +static constexpr int64_t kBlockSize = 512; + +// TODO(crcrpar): Add `n>5` for `low prec params & their higher prec copy` +// TensorListMetadata has to be < 4KB - the limit for kernel launch argument +static constexpr int depth_to_max_tensors[5] = {110, 64, 48, 36, 30}; +static constexpr int depth_to_max_blocks[5] = {320, 320, 320, 320, 320}; +static constexpr int depth_to_max_tensors_scalarlist[5] = {96, 64, 48, 36, 30}; +static constexpr int depth_to_max_tensors_scalarlist_of_complex_double[2] = { + 72, + 60}; + +template +__device__ __forceinline__ bool is_aligned(T* p) { + return ((uint64_t)p) % (kILP * sizeof(T)) == 0; +} + +template +__device__ __forceinline__ void load_store( + T* dst, + T* src, + int64_t dst_offset, + int64_t src_offset) { + using LT = at::native::memory::aligned_vector; + ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset]; +} + +template +struct TensorListMetadata { + const void* addresses[n][depth_to_max_tensors[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; + int start_tensor_this_launch; +}; + 
+template +struct TensorListScalarListMetadata { + const void* addresses[n][depth_to_max_tensors_scalarlist[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors_scalarlist[n - 1]]; + scalar_vals_t scalar_vals[depth_to_max_tensors_scalarlist[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; +}; + +// note(mkozuki): `n` of 1&2 violate the limit of cuda kernel argument size of +// 4kb with `c10::complex` +template <> +struct TensorListScalarListMetadata, 1> { + const void* addresses[1] + [depth_to_max_tensors_scalarlist_of_complex_double[0]]; + int64_t + numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[0]]; + c10::complex + scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[0]]; + unsigned char block_to_tensor[depth_to_max_blocks[1 - 1]]; + int block_to_chunk[depth_to_max_blocks[1 - 1]]; +}; + +template <> +struct TensorListScalarListMetadata, 2> { + const void* addresses[2] + [depth_to_max_tensors_scalarlist_of_complex_double[1]]; + int64_t + numel_for_tensor[depth_to_max_tensors_scalarlist_of_complex_double[1]]; + c10::complex + scalar_vals[depth_to_max_tensors_scalarlist_of_complex_double[1]]; + unsigned char block_to_tensor[depth_to_max_blocks[2 - 1]]; + int block_to_chunk[depth_to_max_blocks[2 - 1]]; +}; + +// NOTE(crcrpar): This is a conservative resolution to handle `state_steps` +// whose each element is `at::Tensor` of 1 element representing the number of +// `step`s called so far. 
+template +struct FusedOptimizerTensorListMetadata { + const void* addresses[n][depth_to_max_tensors[n - 1]]; + int64_t numel_for_tensor[depth_to_max_tensors[n - 1]]; + const void* state_steps_addresses[depth_to_max_tensors_scalarlist[n - 1]]; + unsigned char block_to_tensor[depth_to_max_blocks[n - 1]]; + int block_to_chunk[depth_to_max_blocks[n - 1]]; + int start_tensor_this_launch; +}; + +template +C10_LAUNCH_BOUNDS_1(kBlockSize) +__global__ void multi_tensor_apply_kernel( + T tensorListMeta, + U callable, + ArgTypes... args) { + // Hand the chunk information to the user-supplied functor to process however + // it likes. + callable(kChunkSize, tensorListMeta, args...); +} + +} // namespace + +// multi_tensor_apply enables horizontal fusion across lists of tensors. +// For example, whereas you once had a for-loop of a + b = c, where a, b, +// and c are individual tensors in lists as, bs, and cs, you can now with +// fewer kernel launches compute as + bs = cs. +// +// You can also imagine bs to be a scalar list vs a tensor list. +// +// The function below takes in tensor lists, scalars, and a callable and +// chunks up the computation to launch as few kernels as possible by iterating +// through every "chunk" in every tensor (thus the nested for loops). In the +// simplest case, everything gets bundled into just one kernel launch, but +// due to blocksize constraints, we may need to launch multiple kernels. +// Each kernel launch is defined by one tensorListMeta construct, which we +// use to track and reset the necessary metadata for each launch. +template +void multi_tensor_apply( + std::vector>& tensor_lists, + at::ArrayRef scalars, + T callable, + ArgTypes... 
args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth."); + const size_t n_tensors = tensor_lists[0].size(); + using scalar_vals_t = typename T::opmath_t; + TensorListScalarListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (size_t t = 0; t < n_tensors; t++) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][t].numel() == 0) { + continue; + } + tensorListMeta.scalar_vals[loc_tensor_info] = scalars[t].to(); + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][t].const_data_ptr(); + } + loc_tensor_info++; + + // now we enter [chunking territory]. + // we will launch a kernel when EITHER the blocks get filled up OR + // the tensors get filled up. There will always be at least one block + // per tensor since the zero-sized ones will not enter the loop, so + // the nested forloop within represents iterating through the chunks + // of a single tensor. + const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + for (auto chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + // a tensor is not considered full unless all its chunks have been + // processed + const bool tensors_full = + (loc_tensor_info == depth_to_max_tensors_scalarlist[depth - 1] && + chunk == chunks - 1); + const bool blocks_full = + (loc_block_info == depth_to_max_blocks[depth - 1]); + + if (tensors_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. 
+ loc_block_info = 0; + // all chunks have already been handled in the kernel + if (chunk == chunks - 1) { + loc_tensor_info = 0; + } else { // blocks were full and tensor chunks remain + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + tensorListMeta.scalar_vals[0] = + tensorListMeta.scalar_vals[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + } + } + } + } + + // note: [finishing what we started] + // if there's remaining work to be done but the tensors/blocks aren't full + // yet we are at the end, submit the kernel to do the work! + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +template +void multi_tensor_apply( + std::vector>& tensor_lists, + T callable, + ArgTypes... args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth."); + const size_t n_tensors = tensor_lists[0].size(); + TensorListMetadata tensorListMeta; + tensorListMeta.start_tensor_this_launch = 0; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (size_t t = 0; t < n_tensors; t++) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][t].numel() == 0) { + continue; + } + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][t].numel(); + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][t].const_data_ptr(); + } + loc_tensor_info++; + + // see note: [chunking territory]. 
+ const auto numel = tensor_lists[0][t].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + for (auto chunk = 0; chunk < chunks; chunk++) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + const bool tensors_full = + (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks - 1); + const bool blocks_full = + (loc_block_info == depth_to_max_blocks[depth - 1]); + + if (tensors_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. + loc_block_info = 0; + if (chunk == chunks - 1) { + loc_tensor_info = 0; + tensorListMeta.start_tensor_this_launch = t + 1; + } else { + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + for (int d = 0; d < depth; d++) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + tensorListMeta.start_tensor_this_launch = t; + } + } + } + } + + // see note: [finishing what we started] + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +template +void multi_tensor_apply_for_fused_optimizer( + std::vector>& tensor_lists, + at::TensorList state_steps, + T callable, + ArgTypes... 
args) { + TORCH_CHECK( + tensor_lists.size() == depth, + "Number of tensor lists has to match the depth"); + const auto num_tensors = tensor_lists[0].size(); + FusedOptimizerTensorListMetadata tensorListMeta; + + int loc_block_info = 0; + int loc_tensor_info = 0; + for (const auto& tensor_index : c10::irange(num_tensors)) { + // short-circuit to avoid adding empty tensors to tensorListMeta + if (tensor_lists[0][tensor_index].numel() == 0) { + continue; + } + tensorListMeta.state_steps_addresses[loc_tensor_info] = + state_steps[tensor_index].const_data_ptr(); + tensorListMeta.numel_for_tensor[loc_tensor_info] = + tensor_lists[0][tensor_index].numel(); + for (const auto& d : c10::irange(depth)) { + tensorListMeta.addresses[d][loc_tensor_info] = + tensor_lists[d][tensor_index].const_data_ptr(); + } + loc_tensor_info++; + + // see above note: [chunking territory] + const auto numel = tensor_lists[0][tensor_index].numel(); + const auto chunks = numel / kChunkSize + (numel % kChunkSize != 0); + TORCH_CHECK(chunks > -1); + for (const auto& chunk : c10::irange(chunks)) { + tensorListMeta.block_to_tensor[loc_block_info] = loc_tensor_info - 1; + tensorListMeta.block_to_chunk[loc_block_info] = chunk; + loc_block_info++; + + const auto tensor_full = + (loc_tensor_info == depth_to_max_tensors[depth - 1] && + chunk == chunks - 1); + const auto blocks_full = loc_block_info == depth_to_max_blocks[depth - 1]; + + if (tensor_full || blocks_full) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>( + tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + + // Reset. 
+ loc_block_info = 0; + if (chunk == chunks - 1) { + loc_tensor_info = 0; + } else { + tensorListMeta.numel_for_tensor[0] = + tensorListMeta.numel_for_tensor[loc_tensor_info - 1]; + tensorListMeta.state_steps_addresses[0] = + tensorListMeta.state_steps_addresses[loc_tensor_info - 1]; + for (const auto& d : c10::irange(depth)) { + tensorListMeta.addresses[d][0] = + tensorListMeta.addresses[d][loc_tensor_info - 1]; + } + loc_tensor_info = 1; + } + } + } + } + + // see above note: [finishing what we've started] + if (loc_block_info != 0) { + multi_tensor_apply_kernel<<< + loc_block_info, + kBlockSize, + 0, + at::cuda::getCurrentCUDAStream()>>>(tensorListMeta, callable, args...); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Resize.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Resize.h new file mode 100644 index 0000000000000000000000000000000000000000..569b145fa61d991472f589a777b2f74b4a277857 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Resize.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include + +#include + +namespace at { namespace native { + +TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes); + +static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) { + // It does not make sense to try to resize a storage + // to hold 0 elements, and this can break + // if storage_offset is positive but + // new_size is 0, so just bail in that case + // (same comment is in Resize.h) + if (self->numel() == 0) { + return; + } + + const Storage &storage = self->unsafe_storage(); + TORCH_CHECK(storage, "Tensor: invalid null storage"); + if (new_size_bytes > storage.nbytes()) { + resize_bytes_cuda(storage.unsafeGetStorageImpl(), new_size_bytes); + } +} + +inline TensorImpl* 
resize_impl_cuda_( + TensorImpl* self, + IntArrayRef size, + at::OptionalIntArrayRef stride, + bool device_guard = true) { + if (self->sizes() == size && (!stride || self->strides() == stride)) { + return self; + } + + // NB: We don't need to hold the device guard when calling from TH + cuda::OptionalCUDAGuard guard; + if (device_guard) { + guard.set_index(self->storage().device().index()); + } + + const auto itemsize = self->dtype().itemsize(); + const auto storage_offset = self->storage_offset(); + size_t storage_size = 1; + if (stride) { + self->set_sizes_and_strides(size, *stride); + storage_size = at::detail::computeStorageNbytes( + size, *stride, itemsize, storage_offset); + } else { + self->set_sizes_contiguous(size); + storage_size = at::detail::computeStorageNbytesContiguous( + size, itemsize, storage_offset); + } + maybe_resize_storage_cuda(self, storage_size); + + return self; +} + +}} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h new file mode 100644 index 0000000000000000000000000000000000000000..656b4ce2c2bbac167457f31e8f554a5e409a2940 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/Sort.h @@ -0,0 +1,17 @@ +#pragma once +#include +#include +#include + +namespace at { +namespace native { + +inline bool should_use_small_sort(const TensorBase &self, int64_t dim) { + return self.size(dim) <= 4096; +} + +void sortKeyValueInplace( + const TensorBase &key, const TensorBase &value, int dim, + bool descending, bool stable=false); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh new file mode 
100644 index 0000000000000000000000000000000000000000..27cc91b152968bcbd1ecdf65da40e4a15494da28 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_amsgrad_impl.cuh @@ -0,0 +1,40 @@ +#pragma once +#include + +namespace at { +namespace native { + +void _fused_adam_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf); + +void _fused_adam_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e31f877227c32fd85de677e3b4b3331b322d1a6f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adam_impl.cuh @@ -0,0 +1,38 @@ +#pragma once +#include + +namespace at { +namespace native { + +void _fused_adam_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + 
const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf); + +void _fused_adam_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..451545117b5133ea8d7c94e8ce8e7ce0920fd2f1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_amsgrad_impl.cuh @@ -0,0 +1,40 @@ +#pragma once +#include + +namespace at { +namespace native { + +void _fused_adamw_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf); + +void _fused_adamw_amsgrad_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf); + +} // 
namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ebe9732346bf4e69b66cb514747af99e4ab249e6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/fused_adamw_impl.cuh @@ -0,0 +1,38 @@ +#pragma once +#include + +namespace at { +namespace native { + +void _fused_adamw_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf); + +void _fused_adamw_cuda_impl_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a38edb538256d6102a4a611e6c9582ddae1901d2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/reduction_template.cuh @@ -0,0 +1,680 @@ +namespace at { +namespace cuda { +//windows doesn't like large string literals, so 
split in two +const std::string reduction_template_0 = R"ESCAPE( + #define C10_HOST_DEVICE __host__ __device__ + #define C10_DEVICE __device__ + #if defined(__clang__) && defined(__HIP__) + #ifndef __forceinline__ + #define __forceinline__ inline __attribute__((always_inline)) + #endif + // until ROCm support for kernel asserts is restored + #define assert(expr) (static_cast(0)) + #endif + + template + __device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + #if defined(__clang__) && defined(__HIP__) + return __shfl_down(value, delta, width); + #else + return __shfl_down_sync(mask, value, delta, width); + #endif + } + + + #if ${complex} + template + __device__ __forceinline__ std::complex WARP_SHFL_DOWN(std::complex value, unsigned int delta, int width = warpSize, unsigned int mask = 0xffffffff) + { + return std::complex( + #if defined(__clang__) && defined(__HIP__) + __shfl_down(value.real(), delta, width), + __shfl_down(value.imag(), delta, width)); + #else + __shfl_down_sync(mask, value.real(), delta, width), + __shfl_down_sync(mask, value.imag(), delta, width)); + #endif + } + #endif + + // aligned vector generates vectorized load/store on CUDA + template + struct alignas(sizeof(scalar_t) * vec_size) aligned_vector { + scalar_t val[vec_size]; + }; + + + C10_HOST_DEVICE static void reduce_fraction(size_t &numerator, size_t &denominator) { + // get GCD of num and denom using Euclid's algorithm. + // Can replace this with std::gcd if we ever support c++17. 
+ size_t a = denominator; + size_t b = numerator; + while (b != 0) { + a %= b; + // swap(a,b) + size_t tmp = a; + a = b; + b = tmp; + } + + // a is now the GCD + numerator /= a; + denominator /= a; + } + + + + + struct ReduceConfig { + //has to match host-side ReduceConfig in the eager code + static constexpr int BLOCK_X = 0; + static constexpr int BLOCK_Y = 1; + static constexpr int CTA = 2; + + static constexpr int input_vec_size = 4; + int element_size_bytes; + int num_inputs; + int num_outputs; + int step_input = 1; + int step_output = 1; + int ctas_per_output = 1; + int input_mult[3] = {0, 0, 0}; + int output_mult[2] = {0, 0}; + + int block_width; + int block_height; + int num_threads; + + bool vectorize_input = false; + int output_vec_size = 1; + + C10_HOST_DEVICE bool should_block_x_reduce() const { + return input_mult[BLOCK_X] != 0; + } + + C10_HOST_DEVICE bool should_block_y_reduce() const { + return input_mult[BLOCK_Y] != 0; + } + + C10_HOST_DEVICE bool should_global_reduce() const { + return input_mult[CTA] != 0; + } + + C10_DEVICE bool should_store(int output_idx) const { + return output_idx < num_outputs && + (!should_block_x_reduce() || threadIdx.x == 0) && + (!should_block_y_reduce() || threadIdx.y == 0); + } + + C10_DEVICE bool should_reduce_tail() const { + return (!should_block_y_reduce() || threadIdx.y == 0) && + (!should_global_reduce() || blockIdx.y == 0); + } + + C10_HOST_DEVICE int input_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta2 = blockIdx.y; + return (lane * input_mult[BLOCK_X] + + warp * input_mult[BLOCK_Y] + + cta2 * input_mult[CTA]); + } + + template + C10_HOST_DEVICE int output_idx() const { + int lane = threadIdx.x; + int warp = threadIdx.y; + int cta1 = blockIdx.x; + return (lane * output_mult[BLOCK_X] + + warp * output_mult[BLOCK_Y] + + cta1 * step_output) * output_vec_size; + } + + C10_DEVICE int shared_memory_offset(int offset) const { + return threadIdx.x + (threadIdx.y + offset) * blockDim.x; + } 
+ + C10_DEVICE int staging_memory_offset(int cta2) const { + int offset = cta2 + blockIdx.x * gridDim.y; + if (!should_block_x_reduce()) { + offset = threadIdx.x + offset * blockDim.x; + } + return offset; + } + + + }; + + +//TODO this will need to be different for more generic reduction functions +namespace reducer { + + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + + inline __device__ ${functor} + + inline __device__ out_scalar_t project(arg_t arg) { + return (out_scalar_t) arg; + } + + inline __device__ arg_t warp_shfl_down(arg_t arg, int offset) { + return WARP_SHFL_DOWN(arg, offset); + } + + inline __device__ arg_t translate_idx(arg_t acc, int64_t /*idx*/) { + return acc; + } + + // wrap a normal reduction that ignores the index + inline __device__ arg_t reduce(arg_t acc, arg_t val, int64_t idx) { + return combine(acc, val); + } +} + + +struct ReduceJitOp { + using scalar_t = ${scalar_type}; + using arg_t = ${reduction_accum_type}; + using out_scalar_t = ${result_type}; + + using InputCalculator = OffsetCalculator<1>; + using OutputCalculator = OffsetCalculator<2>; + +// static constexpr bool can_accumulate_in_output = +// std::is_convertible::value +// && std::is_convertible::value; + + static constexpr int input_vec_size = ReduceConfig::input_vec_size; + + arg_t ident; + ReduceConfig config; + InputCalculator input_calc; + OutputCalculator output_calc; + const void* src; + const char* dst[2]; //it accepts at most two destinations + // acc_buf used for accumulation among sub Tensor Iterator when accumulation on + // output is not permissible + void* acc_buf; + // cta_buf used for accumulation between blocks during global reduction + void* cta_buf; + int* semaphores; + int64_t base_idx; + bool accumulate; + bool final_output; + int noutputs; + + + C10_DEVICE void run() const { + extern __shared__ char shared_memory[]; + uint32_t output_idx = config.output_idx<${output_vec_size}>(); + 
uint32_t input_idx = config.input_idx(); + auto base_offsets1 = output_calc.get(output_idx)[1]; + + using arg_vec_t = Array; + arg_vec_t value; + + if (output_idx < config.num_outputs && input_idx < config.num_inputs) { + const scalar_t* input_slice = (const scalar_t*)((const char*)src + base_offsets1); + + value = thread_reduce<${output_vec_size}>(input_slice); + } + + if (config.should_block_y_reduce()) { + value = block_y_reduce<${output_vec_size}>(value, shared_memory); + } + if (config.should_block_x_reduce()) { + value = block_x_reduce<${output_vec_size}>(value, shared_memory); + } + + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + arg_vec_t* acc = nullptr; + if (acc_buf != nullptr) { + size_t numerator = sizeof(arg_t); + size_t denominator = sizeof(out_scalar_t); + reduce_fraction(numerator, denominator); + acc = (arg_vec_t*)((char*)acc_buf + (base_offsets[0] * numerator / denominator)); + } + + if (config.should_global_reduce()) { + value = global_reduce<${output_vec_size}>(value, acc, shared_memory); + } else if (config.should_store(output_idx)) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output<${output_vec_size}>(out, value); + } + if (final_output) { + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < ${output_vec_size}; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) 
{ + set_results_to_output<${output_vec_size}>(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + template + C10_DEVICE Array thread_reduce(const scalar_t* data) const { + if (config.vectorize_input) { + assert(output_vec_size == 1); + // reduce at the header of input_slice where memory is not aligned, + // so that thread_reduce will have an aligned memory to work on. + return {input_vectorized_thread_reduce_impl(data)}; + } else { + uint32_t element_stride = input_calc.strides_[0][0] / sizeof(scalar_t); + bool is_contiguous = (input_calc.dims == 1 && element_stride == 1); + if (is_contiguous) { + return thread_reduce_impl(data, [](uint32_t idx) { return idx; }); + } else if (input_calc.dims == 1) { + return thread_reduce_impl(data, [&](uint32_t idx) { return idx * element_stride; }); + } else { + return thread_reduce_impl(data, [&](uint32_t idx) { return input_calc.get(idx)[0] / sizeof(scalar_t); }); + } + } + } + + C10_DEVICE arg_t input_vectorized_thread_reduce_impl(const scalar_t* data) const { + uint32_t end = config.num_inputs; + + // Handle the head of input slice where data is not aligned + arg_t value = ident; + constexpr int align_bytes = alignof(aligned_vector); + constexpr int align_elements = align_bytes / sizeof(scalar_t); + int shift = ((int64_t)data) % align_bytes / sizeof(scalar_t); + if (shift > 0) { + data -= shift; + end += shift; + if(threadIdx.x >= shift && threadIdx.x < align_elements && config.should_reduce_tail()){ + value = reducer::reduce(value, data[threadIdx.x], threadIdx.x - shift); + } + end -= align_elements; + data += align_elements; + shift = align_elements - shift; + } + + // Do the vectorized reduction + using load_t = aligned_vector; + + uint32_t idx = config.input_idx(); + const uint32_t stride = config.step_input; + + // Multiple accumulators to remove dependency between unrolled loops. 
+ arg_t value_list[input_vec_size]; + value_list[0] = value; + + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[i] = ident; + } + + scalar_t values[input_vec_size]; + + load_t *values_vector = reinterpret_cast(&values[0]); + + while (idx * input_vec_size + input_vec_size - 1 < end) { + *values_vector = reinterpret_cast(data)[idx]; + #pragma unroll + for (uint32_t i = 0; i < input_vec_size; i++) { + value_list[i] = reducer::reduce(value_list[i], values[i], shift + idx * input_vec_size + i); + } + idx += stride; + } + + // tail + uint32_t tail_start = end - end % input_vec_size; + if (config.should_reduce_tail()) { + int idx = tail_start + threadIdx.x; + if (idx < end) { + value_list[0] = reducer::reduce(value_list[0], data[idx], idx + shift); + } + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < input_vec_size; i++) { + value_list[0] = reducer::combine(value_list[0], value_list[i]); + } + return value_list[0]; + } + + template + C10_DEVICE Array thread_reduce_impl(const scalar_t* data_, offset_calc_t calc) const { + uint32_t idx = config.input_idx(); + const uint32_t end = config.num_inputs; + const uint32_t stride = config.step_input; + const int vt0=${vt0}; + + using arg_vec_t = Array; + using load_t = aligned_vector; + const load_t* data = reinterpret_cast(data_); + + // Multiple accumulators to remove dependency between unrolled loops. 
+ arg_vec_t value_list[vt0]; + + #pragma unroll + for (int i = 0; i < vt0; i++) { + #pragma unroll + for (int j = 0; j < output_vec_size; j++) { + value_list[i][j] = ident; + } + } + + load_t values[vt0]; + + while (idx + (vt0 - 1) * stride < end) { + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + values[i] = data[calc(idx + i * stride) / output_vec_size]; + } + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx + i * stride); + } + } + idx += stride * vt0; + } + + // tail + int idx_ = idx; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + values[i] = data[calc(idx) / output_vec_size]; + idx += stride; + } + idx = idx_; + #pragma unroll + for (uint32_t i = 0; i < vt0; i++) { + if (idx >= end) { + break; + } + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[i][j] = reducer::reduce(value_list[i][j], values[i].val[j], idx); + } + idx += stride; + } + + // combine accumulators + #pragma unroll + for (int i = 1; i < vt0; i++) { + #pragma unroll + for (uint32_t j = 0; j < output_vec_size; j++) { + value_list[0][j] = reducer::combine(value_list[0][j], value_list[i][j]); + } + } + return value_list[0]; + } + template + C10_DEVICE Array block_x_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + int dim_x = blockDim.x; + args_vec_t* shared = (args_vec_t*)shared_memory; + if (dim_x > warpSize) { + int address_base = threadIdx.x + threadIdx.y*blockDim.x; + shared[address_base] = value; + for (int offset = dim_x/2; offset >= warpSize; offset >>= 1) { + __syncthreads(); + if (threadIdx.x < offset && threadIdx.x + offset < blockDim.x) { + args_vec_t other = shared[address_base + offset]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[address_base] = 
value; + } + } + dim_x = warpSize; + } + + __syncthreads(); + + for (int offset = 1; offset < dim_x; offset <<= 1) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + arg_t other = reducer::warp_shfl_down(value[i], offset); + value[i] = reducer::combine(value[i], other); + } + } + return value; + } + + template + C10_DEVICE Array block_y_reduce(Array value, char* shared_memory) const { + using args_vec_t = Array; + args_vec_t* shared = (args_vec_t*)shared_memory; + shared[config.shared_memory_offset(0)] = value; + for (int offset = blockDim.y / 2; offset > 0; offset >>= 1) { + __syncthreads(); + if (threadIdx.y < offset && threadIdx.y + offset < blockDim.y) { + args_vec_t other = shared[config.shared_memory_offset(offset)]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], other[i]); + } + shared[config.shared_memory_offset(0)] = value; + } + } + return value; + } + )ESCAPE"; + + const std::string reduction_template_1 = R"ESCAPE( + + C10_DEVICE bool mark_block_finished() const { + __shared__ bool is_last_block_done_shared; + + __syncthreads(); + if (threadIdx.x == 0 && threadIdx.y == 0) { + int prev_blocks_finished = atomicAdd(&semaphores[blockIdx.x], 1); + is_last_block_done_shared = (prev_blocks_finished == gridDim.y - 1); + } + + __syncthreads(); + + return is_last_block_done_shared; + } + + template + C10_DEVICE Array accumulate_in_output( + Array out, + Array value + ) const { + Array ret; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + ret[i] = reducer::combine(*(out[i]), value[i]); + } + return ret; + } + + + C10_DEVICE out_scalar_t get_accumulated_output( + out_scalar_t* out, arg_t value + ) const { + assert(!final_output); + return (out_scalar_t)value; + } + + template + C10_DEVICE void set_results(const T x, const uint32_t base_offset) const { + assert(noutputs == 1); + auto res = (out_scalar_t*)((char*)dst[0] + base_offset); + *res = x; + } + +//TODO - multi-output 
reduction - we won't be able to use thrust::pair +//just explicitly specify typed output reads/writes +//Currently implemented for max of two outputs +// template +// C10_DEVICE void set_results(const thrust::pair x, const index_t base_offset) const { +// if (noutputs >= 1) { +// auto res0 = (T1*)((char*)dst[0] + base_offset); +// *res0 = x.first; +// } +// if (noutputs >= 2) { +// // base offset is computed assuming element size being sizeof(T1), so we need to make a +// // correction to obtain the correct base offset +// auto res1 = (T2*) ((char *) dst[1] + base_offset / sizeof(T1) * sizeof(T2)); +// *res1 = x.second; +// } +// } + + template + C10_DEVICE void set_results_to_output(Array value, Array base_offset) const { + assert(final_output); + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + set_results(reducer::project(value[i]), base_offset[i]); + } + } + + template + C10_DEVICE Array global_reduce(Array value, Array *acc, char* shared_memory) const { + using arg_vec_t = Array; + using out_ptr_vec_t = Array; + using offset_vec_t = Array; + + arg_vec_t* reduce_buffer = (arg_vec_t*)cta_buf; + uint32_t output_idx = config.output_idx(); + offset_vec_t base_offsets; + out_ptr_vec_t out; + + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + base_offsets[i] = output_calc.get(output_idx + i)[0]; + out[i] = (out_scalar_t*)((char*)dst[0] + base_offsets[i]); + } + + bool should_store = config.should_store(output_idx); + if (should_store) { + uint32_t offset = config.staging_memory_offset(blockIdx.y); + reduce_buffer[offset] = value; + } + + __threadfence(); // make sure writes are globally visible + __syncthreads(); // if multiple warps in this block wrote to staging, make sure they're all done + bool is_last_block_done = mark_block_finished(); + + if (is_last_block_done) { + value = ident; + if (config.should_block_x_reduce()) { + uint32_t input_offset = threadIdx.x + threadIdx.y * blockDim.x; + uint32_t step = blockDim.x * blockDim.y; + for 
(; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } else { + uint32_t input_offset = threadIdx.y; + uint32_t step = blockDim.y; + for (; input_offset < config.ctas_per_output; input_offset += step) { + uint32_t idx = config.staging_memory_offset(input_offset); + arg_vec_t next = reduce_buffer[idx]; + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine(value[i], next[i]); + } + } + } + value = block_y_reduce(value, shared_memory); + if (config.should_block_x_reduce()) { + value = block_x_reduce(value, shared_memory); + } + if (should_store) { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::translate_idx(value[i], base_idx); + } + } + + if (acc == nullptr) { + if (accumulate) { + value = accumulate_in_output(out, value); + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + *(out[i]) = get_accumulated_output(out[i], value[i]); + } + } + } else { + if (accumulate) { + #pragma unroll + for (int i = 0; i < output_vec_size; i++) { + value[i] = reducer::combine((*acc)[i], value[i]); + } + } + if (final_output) { + set_results_to_output(value, base_offsets); + } else { + *acc = value; + } + } + } + } + + return value; + } +}; + +extern "C" +__launch_bounds__(${max_threads_lb}, 4) +__global__ void reduction_${name}_kernel(ReduceJitOp r){ + r.run(); +} +)ESCAPE"; + +const std::string reduction_template = reduction_template_0 + reduction_template_1; + + +const std::string &get_reduction_template() { + return reduction_template; +} + +}} diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/thread_constants.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/thread_constants.h new file mode 100644 index 0000000000000000000000000000000000000000..651053d663e4c204753cdfa4ae31ed60fed34152 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/thread_constants.h @@ -0,0 +1,22 @@ +#pragma once +#include + +// Marks a lambda as executable on both the host and device. The __host__ +// attribute is important so that we can access static type information from +// the host, even if the function is typically only executed on the device. +#ifndef GPU_LAMBDA +#define GPU_LAMBDA __host__ __device__ +#endif + +#if defined(USE_ROCM) +constexpr int num_threads() { + return 256; +} +#else +constexpr uint32_t num_threads() { + return C10_WARP_SIZE * 4; +} +#endif + +constexpr int thread_work_size() { return 4; } +constexpr int block_work_size() { return thread_work_size() * num_threads(); } diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/OperationUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/OperationUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..8384f4391f9a854c45f625b171dc28d6e2400e4a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/OperationUtils.h @@ -0,0 +1,394 @@ +// Copyright © 2022 Apple Inc. 
+ +#pragma once + +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +#include + +// Fwd declarations +namespace at { + struct TensorIteratorBase; +} +using namespace at::mps; + +namespace at::native::mps { + +void dispatch_sync_with_rethrow(dispatch_queue_t queue, void (^block)()); + +struct MPSScalar { + id getMTLBuffer() const { return __builtin_bit_cast(id, buffer.get()); } + + size_t size = 0; + ScalarType type = ScalarType::Undefined; + c10::DataPtr buffer; // stores MTLBuffer (frees buffer if MPSScalar instance goes out of scope) + union { + float f; // MPS doesn't support 'double' + at::Half h; + int64_t i; + bool b; + c10::complex cf; + c10::complex ch; + at::BFloat16 bf16; + } value {}; +}; + +void runMPSGraph(MPSStream* mpsStream, + MPSGraph* mpsGraph, + NSDictionary* feeds, + NSDictionary* results); + +MPSDataType getMPSDataType(ScalarType scalar_type); +static inline MPSDataType getMPSDataType(const Tensor& t) { + return getMPSDataType(t.scalar_type()); +} +MPSDataType getMPSScalarType(ScalarType scalar_type); +static inline MPSDataType getMPSScalarType(const Tensor& t) { + return getMPSScalarType(t.scalar_type()); +} +MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type); +std::string getMPSTypeString(ScalarType scalar_type, bool short_name = false); +static inline std::string getMPSTypeString(const Tensor& t, bool short_name = false) { + return getMPSTypeString(t.scalar_type(), short_name); +} +std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type); +NSArray* getTensorAxes(const Tensor& t); +NSArray* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim); +std::string getMPSShapeString(MPSShape* shape); +std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true); +std::string 
getArrayRefString(const IntArrayRef s); +// use has_storage() on the returned tensor to determine if src actually is a view +Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst); +Tensor& scatterViewTensor(const at::Tensor& src, at::Tensor& output); +bool canSliceViewTensor(const Tensor& src, MPSShape *mpsShape); +MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mpsShape, const MPSDataType mpsDataType); +MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false); +MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false); + +// The MPSShape could vary based on memory format +MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous); +MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous); + +static inline id getMTLBufferStorage(const at::Tensor& tensor) { + return __builtin_bit_cast(id, tensor.storage().data()); +} + +class Placeholder { + public: + Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {} + Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {} + Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr, + bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid); + MPSGraphTensor* getMPSGraphTensor() { + return _placeholder; + } + MPSGraphTensorData* getMPSGraphTensorData() { + return _value; + } + bool isIntermediate() { + return _value == nullptr; + } + + private: + MPSGraphTensor* _placeholder; + MPSGraphTensorData* _value; + Tensor _tensor; +}; + +void resize_tensor(Tensor* output); +Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device); +MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor); 
+MPSGraphTensor* convertNHWCtoNCHW(MPSGraph *mpsGraph, MPSGraphTensor* tensor); +MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, ScalarType toType); +MPSGraphTensor* castMPSTensor(MPSGraph *mpsGraph, MPSGraphTensor* tensor, MPSDataType toType); +MPSGraphTensorData *getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStream, const Tensor& tensor); +MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar); + +MPSGraph* make_mps_graph(); +void printTensorNDArray(const Tensor& t); +MPSNDArray* ndArrayFromTensor(const Tensor& tensor, MPSShape *shape, MPSDataType mpsType); + +MPSGraphTensor* mpsGraphUnrankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType, MPSShape* mpsShape); +MPSGraphTensor* mpsGraphRankedPlaceHolder(MPSGraph *mpsGraph, const Tensor& tensor); +MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, MPSDataType dataType); +MPSGraphTensor* mpsGraphScalarPlaceHolder(MPSGraph *mpsGraph, const Scalar& scalar); + +string get_mem_format_string(c10::MemoryFormat memory_format); + +using MPSCacheKey = uint64_t; + +// derive this class to cache a graph and its inputs/outputs +// can be used to store any NSObject +struct MPSCachedGraph +{ + MPSCachedGraph(NSObject *object) : _object([object retain]) {} + virtual ~MPSCachedGraph() { + [_object release]; + _object = nullptr; + } + + template + inline T* as() { + return static_cast(this); + } + + MPSGraph *graph() const { return (MPSGraph *)_object; } + NSObject *object() const { return _object; } +private: + NSObject *_object = nullptr; +}; + +struct MPSUnaryCachedGraph : public MPSCachedGraph +{ + MPSUnaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; +}; + +struct MPSUnaryGradCachedGraph : public MPSCachedGraph +{ + MPSUnaryGradCachedGraph(MPSGraph *graph) : 
MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; // some backward input is actually the forward's output + MPSGraphTensor *gradInputTensor_ = nil; +}; + +struct MPSBinaryCachedGraph : public MPSCachedGraph +{ + MPSBinaryCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *outputTensor_ = nil; +}; + +struct MPSBinaryGradCachedGraph : public MPSCachedGraph +{ + MPSBinaryGradCachedGraph(MPSGraph *graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *gradOutputTensor_ = nil; + MPSGraphTensor *inputTensor_ = nil; + MPSGraphTensor *otherTensor_ = nil; + MPSGraphTensor *gradInputTensor_ = nil; +}; + +// TODO: Improve the overall design of MPSGraphCache. +// https://github.com/pytorch/pytorch/issues/77176 +// Cache holding various keys mapped to graphs +struct MPSGraphCache +{ + typedef MPSCachedGraph * (^CreateCachedGraphBlock)(); + + struct CacheEntry { + CacheEntry(const std::string& key, MPSCachedGraph *cachedGraph) : cachedGraph_(cachedGraph), key_(key) {} + MPSCachedGraph* cachedGraph_ = nullptr; + std::string key_; + }; + + public: + + static MPSGraphCache* getInstance() { + if(_instance_cache == nullptr) { + _instance_cache = new MPSGraphCache(); + } + return _instance_cache; + } + + ~MPSGraphCache() { + dispatch_release(serialQueue_); + + for (const auto& i : cache_) { + delete i.second.cachedGraph_; + } + } + + // Disallow the copy constructor and operator= functions + MPSGraphCache(const MPSGraphCache&) = delete; + void operator=(const MPSGraphCache&) = delete; + + MPSCachedGraph* CreateCachedGraph(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + + __block MPSCachedGraph* cachedGraph = nil; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync_with_rethrow(serialQueue_, ^() { + // verify the cached entry doesn't already exist + if (cache_.count(hash) 
!= 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + cachedGraph = entry.cachedGraph_; + } else { + cachedGraph = createCacheBlock(); + CacheEntry entry(key, cachedGraph); + cache_.emplace(hash, entry); + profileCachedGraph(entry); + } + }); + return cachedGraph; + } + + template + inline T* CreateCachedGraphAs(const std::string& key, CreateCachedGraphBlock createCacheBlock) { + return static_cast(CreateCachedGraph(key, createCacheBlock)); + } + + MPSCachedGraph* LookUp(const std::string& key) const { + + __block MPSCachedGraph* cachedGraph = nullptr; + + MPSCacheKey hash = std::hash{}(key); + + dispatch_sync(serialQueue_, ^() { + + if (cache_.count(hash) != 0) { + auto& entry = cache_.at(hash); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS cached graph!\n"); + cachedGraph = entry.cachedGraph_; + profileCachedGraph(entry); + } + }); + return cachedGraph; + } + + template + inline T* LookUpAs(const std::string& key) const { + return static_cast(LookUp(key)); + } + + private: + MPSGraphCache() { + serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL); + } + // this is defined in OperationUtils.mm to not include + // MPSProfiler.h in header OperationUtils.h + void profileCachedGraph(const CacheEntry& cacheEntry) const; + + static MPSGraphCache* _instance_cache; + std::unordered_map cache_; + dispatch_queue_t serialQueue_ = nullptr; + +}; + +// Common template for creating graph with a specified cache if missing +template +inline T* LookUpOrCreateCachedGraph(const std::string& key, std::function instantiate) { + auto cache_ = MPSGraphCache::getInstance(); + if (auto rc = cache_->LookUpAs(key)) { + return rc; + } + return cache_->CreateCachedGraphAs(key, ^mps::MPSCachedGraph*() { + T* newCachedGraph = nil; + @autoreleasepool { + // Initialize graph + auto mpsGraph = mps::make_mps_graph(); + newCachedGraph = new T(mpsGraph); + 
instantiate(mpsGraph, newCachedGraph); + } + return newCachedGraph; + }); +} + +// Common math operations +MPSGraphTensor* log1p(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor); + +#define MPS_CHECK_INT64_OP_SUPPORTED(input_tensor, mac_os_13_3_plus, op_name) \ + if (!mac_os_13_3_plus && input_tensor.scalar_type() == kLong) { \ + TORCH_WARN_ONCE("MPS: no support for int64 for ", op_name, \ + ", downcasting to a smaller data type (int32/float32). Native support for int64 has been added in macOS 13.3."); \ + } + +/** + * Returns distance from lowest to highest element offset in given tensor. + */ +size_t compute_storage_numel_distance(const at::Tensor& t); + +/** + * Checks whether tensor is mapped to a contiguous area in the storage. + */ +inline bool is_dense_in_storage(const at::Tensor& t) { + return compute_storage_numel_distance(t) == static_cast(t.numel()); +} + +static inline void mtl_setBuffer(id encoder, const Tensor& t, unsigned idx) { + [encoder setBuffer:getMTLBufferStorage(t) + offset:t.storage_offset() * t.element_size() + atIndex:idx]; +} + +static inline void mtl_dispatch1DJob(id encoder, + id cplState, + uint32_t length) { + const uint32_t maxThreadsPerGroup = [cplState maxTotalThreadsPerThreadgroup]; + auto size = MTLSizeMake(length, 1, 1); + auto threadGroupSize = MTLSizeMake(std::min(maxThreadsPerGroup, length), 1, 1); + [encoder dispatchThreads:size threadsPerThreadgroup:threadGroupSize]; +} + +id generateKernelDataOffsets(id commandEncoder, const TensorIteratorBase& iter, bool use_64bit_index = false); + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1) { + return @{ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData() }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& 
p3) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3, Placeholder& p4) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(), + p4.getMPSGraphTensor(): p4.getMPSGraphTensorData(), + }; +} + +inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds, Placeholder& result) { + runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result)); +} + +inline bool supportsComplex() { + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); +} + +// MPS yet to support double types, but starting from MacOS 14, supports bfloat16 +inline bool supportedFloatingType(ScalarType dtype) { + return dtype == kFloat || dtype == kHalf || dtype == kBFloat16; +} + +inline bool supportedFloatingType(const Tensor& t) { + return supportedFloatingType(t.scalar_type()); +} + +} // namespace at::native::mps diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/TensorFactory.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/TensorFactory.h new file mode 100644 index 0000000000000000000000000000000000000000..e6c9da0babbbedc71e41820aabf7c1c71274bd44 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/TensorFactory.h @@ -0,0 +1,12 @@ +// Copyright © 2022 Apple Inc. + +#define AT_DISPATCH_MPS_TYPES(TYPE, NAME, ...) 
\ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)) diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..cee721d7bc8f6e560b8ad9ef2e995fc23708b25f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerFunctions.h @@ -0,0 +1,103 @@ +/** + * Transformer-specific NestedTensor utility functions. + * + * Not co-located with NestedTensor core code yet because they only + * support specific cases needed in transformers. + */ +#pragma once + +#include + +#include +#include + +namespace c10 { +class Scalar; +} // namespace c10 + +namespace at { +class Tensor; +namespace native { +struct NestedTensorImpl; + +// Requires that self is a contiguous NestedTensor, other is not a +// NestedTensor, self.dim() == 3, and other.dim() == 2. Also, self +// must have a consistent last dimension across its included Tensors +// and that dimension must match other.size(0). +Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other); + +// Requires that mat1 is a contiguous NestedTensor, self & mat2 are +// not NestedTensors, mat1.dim() == 3, mat2.dim() == 2, and that mat1 +// has a consistent last dimension across its included Tensors that +// matches mat2.size(0). 
+Tensor NestedTensor_times_Tensor_plus_Tensor_addmm( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const c10::Scalar& beta, + const c10::Scalar& alpha, + c10::optional use_gelu = c10::nullopt); + +Tensor NestedTensor_add_NestedTensor_in_place( + const Tensor& self, + const Tensor& other); + +TORCH_API Tensor NestedTensor_batch_offsets_from_size_tensor( + const Tensor& sizes, + int64_t extra_elements); + +Tensor NestedTensor_from_padded_tensor_cpu( + const Tensor& padded, + const NestedTensorImpl& nt); + +Tensor NestedTensor_to_mask(const Tensor& nt, c10::optional mask_dim, c10::optional mask_dim_length); + +template +void remove_padding_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +void remove_padding_transform0213_kernelLauncher( + const T* input, + T* output, + const int* offsets, + const int* input_sizes, + const int* output_sizes, + int output_dim, + const int batch_size); + +template +void add_padding_kernelLauncher( + T* input, + T* output, + T padding_value, + const int* offsets, + const int* input_sizes, + int input_dim, + const std::vector& output_sizes, + const int batch_size, + const int output_batch_size); + +TORCH_API Tensor flash_attention_helper( + const Tensor& query, + const Tensor& key, + const Tensor& value, + double dropout_p, + bool need_attn_weights, + bool is_causal); + +TORCH_API std::tuple mem_efficient_helper_nested_unpacked( + const Tensor& query, + const Tensor& key, + const Tensor& value, + double dropout_p, + bool need_attn_weights, + bool is_causal); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h new file mode 100644 index 
0000000000000000000000000000000000000000..1ff342a643c38590255d68d10133f171c176f836 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h @@ -0,0 +1,130 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace native { + +Tensor& quantize_tensor_per_tensor_affine( + const Tensor& rtensor, + Tensor& qtensor, + double scale, + int64_t zero_point); +Tensor& quantize_tensor_per_channel_affine( + const Tensor& rtensor, + Tensor& qtensor, + Tensor scales, + Tensor zero_points, + int64_t axis); + +Tensor& quantize_tensor_per_channel_float_qparams( + const Tensor& rtensor, + Tensor& qtensor, + Tensor scales, + Tensor zero_points, + int64_t axis); + +Tensor& dequantize_tensor_per_tensor_affine( + const Tensor& qtensor, + Tensor& rtensor, + double scale, + int64_t zero_point); +Tensor& dequantize_tensor_per_channel_affine( + const Tensor& qtensor, + Tensor& rtensor, + Tensor scales, + Tensor zero_points, + int64_t axis); +Tensor& dequantize_tensor_per_channel_float_qparams( + const Tensor& qtensor, + Tensor& rtensor, + Tensor scales, + Tensor zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, double scale, int64_t zero_point); + +using quantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using dequantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, double scale, int64_t zero_point); + +using dequantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using 
dequantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, float scale, float zero_point); + +using dequantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, float scale, float zero_point); + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_fn, + quantize_tensor_per_tensor_affine_stub); +DECLARE_DISPATCH( + quantize_tensor_per_channel_affine_fn, + quantize_tensor_per_channel_affine_stub); +DECLARE_DISPATCH( + quantize_tensor_per_channel_float_qparams_fn, + quantize_tensor_per_channel_float_qparams_stub); + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_fn, + dequantize_tensor_per_tensor_affine_stub); +DECLARE_DISPATCH( + dequantize_tensor_per_channel_affine_fn, + dequantize_tensor_per_channel_affine_stub); +DECLARE_DISPATCH( + dequantize_tensor_per_channel_float_qparams_fn, + dequantize_tensor_per_channel_float_qparams_stub); + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_fn, + quantize_tensor_per_tensor_affine_sub_byte_stub); + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_fn, + dequantize_tensor_per_tensor_affine_sub_byte_stub); + +template +TORCH_API Tensor quantize_tensor( + Tensor rtensor, + Tensor qtensor, + double scale, + int64_t zero_point); +template +TORCH_API Tensor dequantize_tensor( + Tensor qtensor, + Tensor rtensor, + double scale, + int64_t zero_point); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h new file mode 100644 index 
0000000000000000000000000000000000000000..6f8ff918c1d2f3e421922650161aaa41eda9545f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h @@ -0,0 +1,62 @@ +#pragma once +#include +#include + +namespace at::native::quantized { +namespace { +// MakeConvOutputShape used from both CPU and CUDA libraries +// and exporting symbol from torch_cpu would probably take more storage +// than duplicating implementation which likely be inlined away +template +at::SmallVector MakeConvOutputShape( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation); + +#if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK) +template <> +at::SmallVector MakeConvOutputShape<2>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const at::List& dilation) { + const int H = input_image_shape[0]; + const int W = input_image_shape[1]; + const int64_t Y_H = + (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_W = + (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + return {N, M, Y_H, Y_W}; +} + +template <> +at::SmallVector MakeConvOutputShape<3>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const torch::List& dilation) { + const int D = input_image_shape[0]; + const int H = input_image_shape[1]; + const int W = input_image_shape[2]; + const int64_t Y_D = + (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_H = + (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + const int64_t Y_W = + (W + 2 * 
padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1; + return {N, M, Y_D, Y_H, Y_W}; +} + +#endif +} // anonymous namespace +} // namespace at::native::quantized diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0e240b5a8e9afc61f8828f4162f1b89c7ec06bb7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h @@ -0,0 +1,14 @@ +#pragma once +#include + +namespace at { +namespace native { +using masked_fill_kernel_quantized_fn = void(*)(TensorIterator& iter, const Scalar& value, double scale, int zero_point); +using index_put_kernel_quantized_fn = void(*)(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate, double scale, int zero_point); + +DECLARE_DISPATCH(masked_fill_kernel_quantized_fn, masked_fill_kernel_quantized_stub); +DECLARE_DISPATCH(index_put_kernel_quantized_fn, index_put_kernel_quantized_stub); + + +} // native +} // at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..a442628573fecd18da0afdebc10dba2bd9faf8b1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include + +struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + at::Tensor input, + 
double output_scale, + int64_t output_zero_point) = 0; + + // out variant of LinearPackedParamsBase::apply + virtual at::Tensor& apply_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual at::Tensor& apply_relu_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32): + // input -> q* -> dq* -> linear* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // X: float32 Tensor, will be quantized to quint8 in the op + // W_prepack: packed qint8 quantized weight and bias + // Returns: + // Y: float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + throw std::runtime_error( + "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed " + "parameter type"); + return {}; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32): + // input -> q* -> dq* -> linear* -> relu* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // input: float32 Tensor, will be quantized to quint8 
in the op + // Returns: + // float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + throw std::runtime_error( + "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed " + "parameter type"); + return {}; + } + + virtual at::Tensor apply_dynamic( + at::Tensor input, + bool reduce_range = false) = 0; + virtual at::Tensor apply_dynamic_relu( + at::Tensor input, + bool reduce_range = false) = 0; + + virtual at::Tensor& apply_dynamic_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_out is not implemented for this packed " + "parameter type"); + return output; + } + virtual at::Tensor& apply_dynamic_relu_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual std::tuple> unpack() = 0; + + virtual c10::optional bias() = 0; + + virtual void set_bias(c10::optional /*bias*/) { + throw std::runtime_error( + "set_bias is not implemented for this packed " + "parameter type"); + } +}; + +template +struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) = 0; + + virtual std::tuple> unpack() = 0; + + virtual torch::List stride() const = 0; + virtual torch::List padding() const = 0; + virtual torch::List output_padding() const = 0; + virtual torch::List dilation() const = 0; + virtual int64_t groups() const = 0; + virtual bool transpose() const = 0; +}; diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..140b716df2691db2fcc6bab0d52ad380ef10067e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const c10::optional& offsets, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor embeddingbag_4bit( + const at::Tensor& indices, + const c10::optional& offsets, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor unpack() = 0; + + virtual int64_t bit_rate() const = 0; + virtual int64_t version() const = 0; +}; diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..88ff258be891fb320cfdc98ebb1d5363c055d631 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h @@ -0,0 +1,527 @@ +#pragma once + +#ifdef USE_PYTORCH_QNNPACK +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include 
+#else +#include +#endif + +#include +inline int kPaddingChannels = 8; +struct QnnpackOperatorDeleter { + void operator()(pytorch_qnnp_operator_t op) { + pytorch_qnnp_delete_operator(op); + } +}; + +// PackedWeight struct for QNNPACK stores the original Weight and Bias as +// QNNPACK currently does not support an unpack function. +// For PyTorch Mobile, once the model is scripted and serialized we don't need +// to call unpack, so we can save some memory by checking for this case and free +// the original weights after packing. +// Input scale is set to null in pre-pack step. QNNPACK needs bias quantized +// with input scale which is available at runtime in pytorch. During runtime if +// input scale value changes then we requantize bias with the updated scale. For +// inference we expect the graph to be static so the input scale should not +// change across consecutive inference calls. +struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { + PackedLinearWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + c10::optional input_scale, + at::Tensor w_scales, + std::vector&& w_zps) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias_(at::native::mobile::allocate_padded_contiguous_if_needed( + bias, bias.suggest_memory_format())), + per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine), + input_scale(std::move(input_scale)), + w_scales(std::move(w_scales)), + w_zero_points(std::move(w_zps)), + q_scheme(this->orig_weight.qscheme()) { + weight_sizes = this->orig_weight.sizes().vec(); + } + + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias_; + bool per_channel_; + c10::optional input_scale; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + std::vector weight_sizes; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double 
output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + std::tuple> unpack() override; + + c10::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + c10::optional bias); + + bool per_channel() const { + return per_channel_; + } + + private: + std::mutex qnnp_mutex_; + +#ifdef USE_XNNPACK + xnnpack_operator xnnp_linear_op; + + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range); +}; + +template +struct PackedConvWeightsQnnp : public ConvPackedParamsBase { + PackedConvWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + c10::optional input_scale, + std::vector kernel, + at::Tensor w_scale, + std::vector&& w_zps, + bool is_per_channel) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + is_per_channel_(is_per_channel), + input_scale(input_scale), + kernel_(std::move(kernel)), + w_scales(std::move(w_scale)), + w_zero_points(std::move(w_zps)) { + const bool any_padding = std::any_of( + padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; }); + const size_t kernel_size = + std::accumulate(kernel_.begin(), kernel_.end(), 1, std::multiplies<>()); + + const size_t group_input_channels 
= transpose + ? this->orig_weight.size(0) / groups + : this->orig_weight.size(1); + const size_t group_output_channels = transpose + ? this->orig_weight.size(1) + : this->orig_weight.size(0) / groups; + + const size_t kernel_depth = kSpatialDim == 3 ? kernel_[0] : 1; + const size_t kernel_height = kernel_[kSpatialDim - 2]; + const size_t kernel_width = kernel_[kSpatialDim - 1]; + + pytorch_qnnp_ukernel_type ukernel_type; + if (transpose_) { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_none; + + const bool has_depthwise_dimensions = + (kSpatialDim == 2 && + ((kernel_height == 3 && kernel_width == 3) || + (kernel_height == 5 && kernel_width == 5))) || + (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 && + kernel_depth == 3); + const bool has_depthwise_grouping = + group_input_channels == 1 && group_output_channels == 1 && groups > 1; + + if (has_depthwise_dimensions && has_depthwise_grouping) { + ukernel_type = pytorch_qnnp_ukernel_type_dwconv; + } else if ( + kernel_size == 1 && + std::all_of( + stride_.begin(), + stride_.end(), + [](const auto& e) { return e == 1; }) && + !any_padding) { + ukernel_type = group_input_channels >= SIZE_MAX + ? pytorch_qnnp_ukernel_type_xzp_gemm + : pytorch_qnnp_ukernel_type_gemm; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } + } + + if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) { + TORCH_INTERNAL_ASSERT( + false, "Per channel quantized weights are not supported for XZP kernels"); + } + + pytorch_qnnp_operator_t convolution{nullptr}; + // Initially all the params are set to zero. 
+ convolution = static_cast( + calloc(1, sizeof(struct pytorch_qnnp_operator))); + if (convolution == nullptr) { + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for pytorch_qnnp_operator structure", + sizeof(struct pytorch_qnnp_operator)); + } + + convolution_op = + std::unique_ptr( + convolution); + + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + convolution->ukernel_type = ukernel_type; + convolution->groups = groups; + convolution->group_input_channels = group_input_channels; + convolution->group_output_channels = group_output_channels; + convolution->kernel_depth = kernel_depth; + convolution->kernel_height = kernel_height; + convolution->kernel_width = kernel_width; + convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1; + convolution->stride_height = stride_[kSpatialDim - 2]; + convolution->stride_width = stride_[kSpatialDim - 1]; + convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1; + convolution->dilation_height = dilation_[kSpatialDim - 2]; + convolution->dilation_width = dilation_[kSpatialDim - 1]; + convolution->input_padding_height = padding_[kSpatialDim - 2]; + convolution->input_padding_width = padding_[kSpatialDim - 1]; + convolution->input_padding_depth = kSpatialDim == 3 ? 
padding_[0] : 0; + convolution->per_channel = is_per_channel_; + convolution->transpose = transpose_; + + const uint32_t kr = pytorch_qnnp_params.q8conv.kr; + const size_t k_stride = (group_input_channels + (kr - 1)) & -kr; + + size_t zero_size = sizeof(uint8_t) * k_stride; + size_t zero_offset = 0; + + if (transpose_) { + convolution->adjustment_width = output_padding_[1]; + convolution->adjustment_height = output_padding_[0]; + if (group_input_channels < 8) { + zero_size += 8; + zero_offset = 8; + } + } else { + zero_buffer_size = 0; + if (any_padding) { + zero_size = 0; + zero_offset = 0; + if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) { + const uint32_t cr = pytorch_qnnp_params.q8dw9.cr; + const size_t group_stride = (groups + (cr - 1)) & -cr; + if (groups >= 8) { + zero_size = sizeof(uint8_t) * group_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * group_stride + 8; + zero_offset = sizeof(uint8_t) * 8; + } + } else if ( + ukernel_type == pytorch_qnnp_ukernel_type_conv || + ukernel_type == pytorch_qnnp_ukernel_type_gemm) { + if (group_input_channels >= 8) { + zero_size = sizeof(uint8_t) * k_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * k_stride + 8; + zero_offset = 8; + } + } + } + } + + // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI) + void* zero_buffer = malloc(zero_size); + if (zero_buffer == nullptr) { + pytorch_qnnp_delete_operator(convolution); + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for zero padding", + zero_size); + } + // Need to set to input zero point + // memset(zero_buffer, input_zero_point, zero_size); + zero_buffer_size = zero_size; + convolution->zero_buffer = zero_buffer; + convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset); + } + + std::unique_ptr convolution_op; + #ifdef USE_XNNPACK + xnnpack_operator xnnp_convolution_op; + #endif // USE_XNNPACK + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias; + torch::List stride_; 
+ torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + bool is_per_channel_; + c10::optional input_scale; + std::vector kernel_; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + size_t zero_buffer_size; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range=false) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + c10::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + bool per_channel() const { + return is_per_channel_; + } + + private: + std::mutex qnnp_mutex_; + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + +#ifdef USE_XNNPACK + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK +}; + +enum class Activation : uint8_t { NONE = 0, RELU = 1 }; + +#if defined(__ANDROID__) && !defined(__NDK_MAJOR__) +template +inline float Round(const float x) { + return ::nearbyintf(x); +} +inline double Round(const double x) { + return ::nearbyint(x); +} +#else +template +inline T Round(const T x) { + return 
std::nearbyint(x); +} +#endif + +template +inline T QuantizeValue(float scale, int32_t zero_point, float value) { + const int32_t qmin = std::numeric_limits::min(); + const int32_t qmax = std::numeric_limits::max(); + auto r = zero_point + static_cast(Round(value / scale)); + r = std::max(r, qmin); + r = std::min(r, qmax); + return static_cast(r); +} + +template +inline std::pair activationLimits( + float scale, + int32_t zero_point, + Activation Ac) { + switch (Ac) { + case Activation::NONE: + return {std::numeric_limits::min(), + std::numeric_limits::max()}; + case Activation::RELU: + return {QuantizeValue(scale, zero_point, 0.0), + std::numeric_limits::max()}; + default: +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + } +} + +namespace at { +namespace native { +namespace qnnp_avgpool_helper { +Tensor qnnpack_avg_pool2d( + Tensor input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + c10::optional divisor_override); +} // qnnp_avgpool_helper +} // namespace native +} // namespace at + +namespace { +C10_UNUSED std::vector generate_requantization_scales( + const at::Tensor& weight_scales, + const float input_scale, + const float output_scale, + std::vector& requant_scales) { + // Since weight scale is allocated with padding + // weight_scales.numel() gives us padded num elements. 
+ const auto num_output_channels_padded = weight_scales.numel(); + float *const weight_scales_data = weight_scales.data_ptr(); + if (static_cast(requant_scales.size()) < num_output_channels_padded) { + requant_scales.resize(num_output_channels_padded); + } + for (const auto i : c10::irange(num_output_channels_padded)) { + const auto inverse_output_scale = 1.f /output_scale; + requant_scales[i] = (weight_scales_data[i] * input_scale) * inverse_output_scale; + TORCH_CHECK( + (requant_scales[i] > 0.0f && std::isnormal(requant_scales[i])), + "failed to create op with requantization scale: ", + requant_scales[i], + ": requantization scale must be finite and positive"); + } + return requant_scales; +} + +C10_UNUSED std::pair, at::Tensor> make_zero_points_and_scales_tensor( + const at::Tensor& weight_contig, + bool transpose = false, + uint32_t groups = 1 + ) { + const int out_ch_idx = transpose ? 1 : 0; + const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1); + // Add 8 to account for bufferring needed by QNNPACK. + const auto num_output_channels_padded = num_output_channels + kPaddingChannels; + const auto qtype = weight_contig.qscheme(); + std::vector weight_zp(num_output_channels_padded, 0); + // Adjust weight zero point, similar to weight data. 
+ if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(weight_contig.q_zero_point() + 128); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong, + "Per channel zero points dtype must be long int."); + const int64_t* per_channel_zero_points = + weight_contig.q_per_channel_zero_points().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(per_channel_zero_points[i] + 128); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + at:: Tensor weight_scales = + at::empty( + {num_output_channels_padded}, + at::device(at::kCPU).dtype(at::kFloat)); + float *const weight_scales_data = weight_scales.data_ptr(); + if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = weight_contig.q_scale(); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_scales().scalar_type() == at::kDouble, + "Per channel scales dtype must be double."); + const double *const per_channel_scales = + weight_contig.q_per_channel_scales().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = static_cast(per_channel_scales[i]); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + for (const auto i : c10::irange(num_output_channels, num_output_channels_padded)) { + weight_scales_data[i] = 1.f; + } + return {weight_zp, weight_scales}; +} +} // namespace + +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h new file mode 100644 index 
0000000000000000000000000000000000000000..0b026c739786a0b68ccf779f2724c1c4607998e1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h @@ -0,0 +1,239 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +namespace quant_utils { +namespace { + float RawUint16ToFp16(unsigned short value) { + // Convert raw 16 bits half precision floating point number + // to single precision floating point number. + const unsigned short sign_bits = value >> 15; + const unsigned short exponent_bits = value >> 10 & 0x1f; + const unsigned short significand_bits = value & 0x3ff; + + const float sign = sign_bits ? -1 : 1; + const float significand = + 1 + significand_bits * 0.0009765625f; // 0.0009765625f = 0x1p-10 = 2^-10; + const float exponent = exponent_bits - 0xf; + + return sign * std::ldexp(significand, exponent); +} + +template +bool CheckAndSaturate(T max_val, T* element) { + if (*element > max_val) { + *element = max_val; + return true; + } + if (*element < -max_val) { + *element = -max_val; + return true; + } + return false; +} +} +using namespace std; +// A structure to hold quantization parameters 'scale' and 'zero_point'. +// The meaning of these values is as the constants in the quantization equation +// +// real_value = scale * (quantized_value - zero_point) +// +// In other words, 'zero_point' is the quantized value that corresponds +// to the real value 0, and 'scale' is the difference of real values +// corresponding to consecutive quantized values. +struct TensorQuantizationParams { + double scale; + std::int32_t zero_point; + int precision; +}; + +// Use fp16_min as the small scale cutoff because we don't want to use scales in +// fp16 subnormal range. This is to be consistent with Glow and FakeLowP +// implementation for NNPI. 
+constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; + +// Following implementation should be identical to fbgemm::ChooseQuantizationParams +inline TensorQuantizationParams ChooseQuantizationParams( + float min, + float max, + int32_t qmin, + int32_t qmax, + bool preserve_sparsity = false, + bool force_scale_power_of_two = false, + bool reduce_range = false) { + TORCH_CHECK( + min <= max, + "In ChooseQuantizationParams, min should be less than or equal to max"); + + if (reduce_range) { + qmin = qmin/2; + qmax = qmax/2; + } + if (min < 0 && max > 0 && preserve_sparsity) { + int symmetric_qmin = -((qmax - qmin) / 2 + 1); + int symmetric_qmax = (qmax - qmin) / 2; + double max_scale = + std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax)); + min = max_scale * symmetric_qmin; + max = max_scale * symmetric_qmax; + } + + // We extend the [min, max] interval to ensure that it contains 0. + // Otherwise, we would not meet the requirement that 0 be an exactly + // representable value. + min = std::min(min, 0.f); + max = std::max(max, 0.f); + + TORCH_CHECK( + qmin < qmax, + "In ChooseQuantizationParams, qmin should be less than qmax"); + + // Use double precision for intermediate computation but use single precision + // in final number to reflect the actual number used during quantization. + double scale = (static_cast(max) - min) / (qmax - qmin); + // If scale is 0 or too small so its reciprocal is infinity, we arbitrary + // adjust the scale to 0.1 . We want to avoid scale's reciprocal being + // infinity because some of fbgemm code pre-computes scale's reciprocal to do + // multiplication instead of division in the time critical part of code. 
+ if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { + scale = 0.1; + } + TORCH_CHECK(scale > 0, "quantization scale should be > 0"); + + if (force_scale_power_of_two) { + if (scale < 1) { + scale = 1.0 / (1 << static_cast(floor(log(1.0 / scale) / log(2)))); + } else { + scale = 1 << static_cast(ceil(log(scale) / log(2))); + } + } + + // Cut off small scale + if (scale < SMALL_SCALE_THRESHOLD) { + float org_scale = scale; + scale = SMALL_SCALE_THRESHOLD; + // Adjust the min and max based on the new scale + if (min == 0.0f) { + max = SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else if (max == 0.0f) { + min = -SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else { + float amplifier = SMALL_SCALE_THRESHOLD / org_scale; + min *= amplifier; + max *= amplifier; + } + } + + // Zero-point computation. + // First the initial floating-point computation. The zero-point can be + // determined from solving an affine equation for any known pair + // (real value, corresponding quantized value). + // We know two such pairs: (rmin, qmin) and (rmax, qmax). + // The arithmetic error on the zero point computed from either pair + // will be roughly machine_epsilon * (sum of absolute values of terms) + // so we want to use the variant that adds the smaller terms. + double zero_point_from_min = qmin - min / static_cast(scale); + double zero_point_from_max = qmax - max / static_cast(scale); + double zero_point_from_min_error = + std::abs(qmin) - std::abs(min / static_cast(scale)); + double zero_point_from_max_error = + std::abs(qmax) - std::abs(max / static_cast(scale)); + double initial_zero_point = + zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + + // for symmetric quantization (preserve_sparsity == true), we force zero_point + // to be a middle value between qmin and qmax. + // If either min or max is 0, then we just use 0 as zero_point. 
+ if (min < 0 && max > 0 && preserve_sparsity) { + initial_zero_point = static_cast(qmin + qmax) / 2; + } + + // Now we need to nudge the zero point to be an integer + // (our zero points are integer, and this is motivated by the requirement + // to be able to represent the real value "0" exactly as a quantized value, + // which is required in multiple places, for example in Im2col with zero + // padding). + int32_t nudged_zero_point = 0; + if (initial_zero_point < qmin) { + nudged_zero_point = qmin; + } else if (initial_zero_point > qmax) { + nudged_zero_point = qmax; + } else { + nudged_zero_point = nearbyint(initial_zero_point); + } + + TensorQuantizationParams result; + result.scale = scale; + result.zero_point = nudged_zero_point; + return result; +} + +// This function helps to convert the Conv1D dimensions usable by the Conv2d op. +constexpr int64_t kConv1dSqueezeDim = 0; +static C10_UNUSED torch::List MakeArgForConv1d(const torch::List& arg, + int64_t base_value) { + TORCH_CHECK(!arg.empty(), "Argument must have elements."); + torch::List result({arg.get(0), base_value}); + if (arg.size() == 1) { + result[1] = arg.get(0); + } else { + result[1] = arg.get(1); + } + result[kConv1dSqueezeDim] = base_value; + return result; +} + +// The range for using FP16 quantization of weights requires that the elements +// should be in the range of [5.96e-8, 65504]. If it is out of range, then the +// number will be saturated to max or min representable values by FP16. +inline void HandleWeightsSaturation(int64_t N, float* weight) { + const float kFp16Max = RawUint16ToFp16(0x7BFF); + bool found_out_of_range = false; + for (const auto i : c10::irange(N)) { + bool saturate = CheckAndSaturate(kFp16Max, weight + i); + if (saturate) { + found_out_of_range = true; + } + } + if (found_out_of_range) { + TORCH_WARN("FOUND weight out of range "); + } +} + +// Util function for quantizing bias. 
+inline at::Tensor QuantizeBias( + bool is_per_channel, + const at::Tensor& bias, + const at::Tensor& weight_contig, + double input_scale) { + at::Tensor qbias; + if (is_per_channel) { + auto bias_quant_scales = + weight_contig.q_per_channel_scales() * input_scale; + auto bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); + qbias = at::native::quantize_per_channel( + bias, bias_quant_scales, bias_zp, 0, c10::kQInt32); + } else { + qbias = at::native::quantize_per_tensor( + bias, weight_contig.q_scale() * input_scale, 0, c10::kQInt32); + } + return qbias; +} + +} // namespace quant_utils diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h new file mode 100644 index 0000000000000000000000000000000000000000..3ef8a3f4f4f4215e2e18005341ee9e35c1f78703 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h @@ -0,0 +1,258 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Scalar& /*negval_*/); +using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */); +using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point); +using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qclamp_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& min, + const Scalar& max, + at::Tensor& /*qy*/); +using qclamp_minmax_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& /*min or max*/, + at::Tensor& /*qy*/); +using qthreshold_fn = void (*)( + const 
at::Tensor& /*qx*/, + const Scalar& threshold, + const Scalar& value, + at::Tensor& /*qy*/); +using qtanh_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qelu_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*alpha*/, + const Scalar& /*scale*/, + const Scalar& /*input_scale*/, + at::Tensor& /*qy*/); +using qbinary_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Tensor& /*other*/); +using qadd_scalar_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Scalar& other /*other*/); +using qhardswish_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qdropout_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*p*/, + bool training /*training*/, + at::Tensor& /*qy*/); +using qmaxpool_2d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iH, + int64_t iW, // input sizes + int64_t oH, + int64_t oW, // output sizes + int64_t kH, + int64_t kW, // kernel size + int64_t sH, + int64_t sW, // strides + int64_t pH, + int64_t pW, // padding + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qmaxpool_3d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iT, + int64_t iH, + int64_t iW, // input sizes + int64_t oT, + int64_t oH, + int64_t oW, // output sizes + int64_t kT, + int64_t kH, + int64_t kW, // kernel size + int64_t sT, + int64_t sH, + int64_t sW, // strides + int64_t pT, + int64_t pH, + int64_t pW, // padding + int64_t dT, + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qadaptive_avg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideH, + int64_t istrideW); +using qadaptive_avg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeD, + 
int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideD, + int64_t istrideH, + int64_t istrideW); +using qavg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool count_include_pad, + c10::optional divisor_override); + +using qavg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t inputDepth, + int64_t outputWidth, + int64_t outputHeight, + int64_t outputDepth, + int kW, + int kH, + int kD, + int dW, + int dH, + int dD, + int padW, + int padH, + int padD, + bool count_include_pad, + c10::optional divisor_override); + +using qupsample_bilinear2d_fn = void (*)( + Tensor& output, + const Tensor& input, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels, + bool align_corners, + c10::optional scales_h, + c10::optional scales_w); + +using qcat_nhwc_fn = Tensor (*)( + const MaterializedITensorListRef& qxs, + int64_t dim, + double scale, + int64_t zero_point); +using qtopk_fn = void(*)(Tensor&, Tensor&, const Tensor&, int64_t, int64_t, bool, bool); + +using qbatch_norm_fn = void(*)(int64_t, int64_t, int64_t, int64_t, int64_t, const Tensor&, const Tensor&, const Tensor&, Tensor&); + +using qnormalize_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qmean_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* opt_dim */, + bool /* keepdim */, + c10::optional /* opt_dtype */, + Tensor& /* Y */); + +using qstd_inner_dim_fn = 
void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* dim */, + const c10::optional& /* correction */, + bool /* keepdim */, + Tensor& /* Y */); + +using qnormalize_nhwc_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qprelu_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Tensor& /*qw*/); + +DECLARE_DISPATCH(qadaptive_avg_pool2d_fn, qadaptive_avg_pool2d_nhwc_stub); +DECLARE_DISPATCH(qadaptive_avg_pool3d_fn, qadaptive_avg_pool3d_ndhwc_stub); +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_relu_stub); +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_stub); +DECLARE_DISPATCH(qavg_pool2d_fn, qavg_pool2d_nhwc_stub); +DECLARE_DISPATCH(qavg_pool3d_fn, qavg_pool3d_nhwc_stub); +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_relu_stub); +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_stub); +DECLARE_DISPATCH(qbinary_fn, qadd_relu_stub); +DECLARE_DISPATCH(qbinary_fn, qadd_stub); +DECLARE_DISPATCH(qbinary_fn, qmul_relu_stub); +DECLARE_DISPATCH(qbinary_fn, qmul_stub); +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_nhwc_stub); +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_relu_nhwc_stub); +DECLARE_DISPATCH(qclamp_fn, qclamp_stub); +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_min_stub); +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_max_stub); +DECLARE_DISPATCH(qelu_fn, qelu_stub); +DECLARE_DISPATCH(qhardsigmoid_fn, qhardsigmoid_stub); +DECLARE_DISPATCH(qhardswish_fn, qhardswish_stub); +DECLARE_DISPATCH(qdropout_fn, qdropout_stub); +DECLARE_DISPATCH(qmaxpool_2d_fn, qmaxpool_2d_nhwc_stub); +DECLARE_DISPATCH(qmaxpool_3d_fn, qmaxpool_3d_nthwc_stub); +DECLARE_DISPATCH(qnormalize_fn, quantized_normalize_stub); +DECLARE_DISPATCH(qnormalize_nhwc_fn, quantized_groupnorm_nhwc_stub); +DECLARE_DISPATCH(qrelu_fn, qrelu_stub); +DECLARE_DISPATCH(qrelu_leaky_fn, qrelu_leaky_stub); 
+DECLARE_DISPATCH(qgelu_fn, qgelu_stub); +DECLARE_DISPATCH(qsigmoid_fn, qsigmoid_stub); +DECLARE_DISPATCH(qtanh_fn, qtanh_stub); +DECLARE_DISPATCH(qthreshold_fn, qthreshold_stub); +DECLARE_DISPATCH(qtopk_fn, qtopk_stub); +DECLARE_DISPATCH(qupsample_bilinear2d_fn, qupsample_bilinear2d_nhwc_stub); +DECLARE_DISPATCH(qmean_inner_dim_fn, qmean_inner_dim_stub); +DECLARE_DISPATCH(qstd_inner_dim_fn, qstd_inner_dim_stub); +DECLARE_DISPATCH(qprelu_fn, qprelu_stub); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h new file mode 100644 index 0000000000000000000000000000000000000000..9f452a1cc72137beedbbdc0273b7ce87addb4740 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h @@ -0,0 +1,414 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#if !defined(__s390x__) && !defined(__powerpc__) +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +#include + +/* Convolution prepacked parameters serialization. + * + * Version 1 + * + * - Fields: + * 1. weight + * 2. bias + * 3. stride x kSpatialDim + * 4. padding x kSpatialDim + * 5. dilation x kSpatialDim + * 6. groups + * + * Version 2 + * + * - Fields: + * 0. version (string) + * 1. list of non-optional tensors + * 0: packed parameters (int16_t) + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - transpose (0 or 1) + * 1: weight + * 2. list of optional tensors + * 0: bias + * + * Version 3 + * + * - Fields: + * 0. version (int64_t) + * 1. 
list of int64_t configuration values + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - flags (bitmask) + * - (1 << 0) transpose (1 = yes) + * 2. list of optional tensors + * 0: None (helps with type inference) + * 1: weight (this must be present) + * 2: bias + */ + +using ConvParamsSerializationTypeV2 = std::tuple< + // version, for versions 2 and up + std::string, + // non-optional tensors + std::vector, + // optional tensors + std::vector>>; + +using ConvParamsSerializationTypeV3 = std::tuple< + // version, int for versions 3 and up + int64_t, + // configuration values + std::vector, + // optional tensors + std::vector>>; + +// Parses any historical conv packed params format into +// the current format. +template +ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { + + // determine the version based on IValue contents + int version = -1; + if (v.isTuple()) { + const auto& elements = v.toTupleRef().elements(); + if (!elements.empty()) { + auto firstElement = elements[0]; + if (firstElement.isTensor()) { + version = 1; + } else if (firstElement.isString()) { + const std::string& version_str = firstElement.toStringRef(); + // note: not parsing the string to automatically handle bad + // inputs + if (version_str == "2") { + version = 2; + } + } else if (firstElement.isInt()) { + auto raw_version = firstElement.toInt(); + if (raw_version == 3) { + version = 3; + } + } + } + } + TORCH_INTERNAL_ASSERT(version != -1, "Unable to parse serialization version"); + + if (version == 1) { + // version 1 - convert to version 3 manually + + const auto& elements = v.toTupleRef().elements(); + + at::Tensor weight = elements[0].toTensor(); + c10::optional bias = elements[1].toOptional(); + torch::List stride_x_kSpatialDim = elements[2].toTensorList(); + torch::List padding_x_kSpatialDim = elements[3].toTensorList(); + torch::List dilation_x_kSpatialDim = 
elements[4].toTensorList(); + at::Tensor groups = elements[5].toTensor(); + + std::vector config_vals; + config_vals.reserve( + stride_x_kSpatialDim.size() + padding_x_kSpatialDim.size() + + dilation_x_kSpatialDim.size() + kSpatialDim + 3); + config_vals.push_back(kSpatialDim); + for (const auto i : c10::irange(stride_x_kSpatialDim.size())) { + auto stride = stride_x_kSpatialDim.get(i); + config_vals.push_back(stride[0].item()); + } + for (const auto i : c10::irange(padding_x_kSpatialDim.size())) { + auto padding = padding_x_kSpatialDim.get(i); + config_vals.push_back(padding[0].item()); + } + for (const auto i : c10::irange(dilation_x_kSpatialDim.size())) { + auto dilation = dilation_x_kSpatialDim.get(i); + config_vals.push_back(dilation[0].item()); + } + // output_padding does not exist in v1, so we fill in a default value + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + config_vals.push_back(0); + } + config_vals.push_back(groups[0].item()); + // transpose does not exist in v1, so we fill in a default value + config_vals.push_back(0); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 2) { + // version 2 + const auto& elements = v.toTupleRef().elements(); + std::vector non_optional = elements[1].toTensorList().vec(); + std::vector> optional; + + if (elements[2].isTensorList()) { + for (const auto& elem : elements[2].toTensorList()) { + optional.emplace_back(static_cast(elem)); + } + } else { + for (const auto& elem : elements[2].toList()) { + optional.emplace_back(static_cast(elem).toOptional()); + } + } + // create default optional value for bias + if (optional.empty()) { + optional.emplace_back(); + } + + auto config_a = non_optional[0].accessor(); + std::vector config_vals; + config_vals.reserve(config_a.size(0)); + for (const auto i : c10::irange(config_a.size(0))) { + 
config_vals.emplace_back(config_a[i]); + } + + auto weight = non_optional[1]; + auto bias = optional[0]; + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 3) { + return v.to(); + } else { + TORCH_INTERNAL_ASSERT(false, "Unexpected serialized qconv version: ", + version); + } +} + +#define QCONV_SERIALIZATION_VERSION 2 + +#if QCONV_SERIALIZATION_VERSION == 2 +using ConvParamsSerializationType = ConvParamsSerializationTypeV2; + +template +ConvParamsSerializationTypeV2 serialize_conv( + const c10::intrusive_ptr>& params) { + + std::string version = "2"; + std::vector non_optional; + std::vector> optional; + + // create a packed int8_t tensor for conv params + std::vector params_vec; + params_vec.push_back(kSpatialDim); + auto stride = params->stride().vec(); + params_vec.insert(params_vec.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + params_vec.insert(params_vec.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + params_vec.insert(params_vec.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + params_vec.insert(params_vec.end(), output_padding.begin(), + output_padding.end()); + params_vec.push_back(params->groups()); + params_vec.push_back(params->transpose()); + int64_t vec_size = params_vec.size(); + at::Tensor params_tensor = at::from_blob( + params_vec.data(), {vec_size}, + at::TensorOptions().dtype(at::kShort)) + // clone to retain ownership of the data + .clone(); + + auto [weight, bias] = params->unpack(); + + non_optional.emplace_back(std::move(params_tensor)); + non_optional.emplace_back(std::move(weight)); + optional.emplace_back(std::move(bias)); + + return std::tie(version, non_optional, optional); +} + +#elif QCONV_SERIALIZATION_VERSION == 3 +using ConvParamsSerializationType = 
ConvParamsSerializationTypeV3; + +template +ConvParamsSerializationTypeV3 serialize_conv( + const c10::intrusive_ptr>& params) { + std::vector config_vals; + config_vals.push_back(kSpatialDim); + auto stride = params->stride().vec(); + config_vals.insert(config_vals.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + config_vals.insert(config_vals.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + config_vals.insert(config_vals.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + config_vals.insert(config_vals.end(), output_padding.begin(), + output_padding.end()); + config_vals.push_back(params->groups()); + config_vals.push_back(params->transpose()); + + auto [weight, bias] = params->unpack(); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); +} + +#else +#error "Invalid qconv serialization version." 
+#endif + +template +c10::intrusive_ptr> deserialize_conv( + ConvParamsSerializationTypeV3 state) { + auto [version, config_vals, tensors] = state; + TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); + + TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); + c10::optional weight = tensors[1]; + c10::optional bias = tensors[2]; + TORCH_INTERNAL_ASSERT(weight, "Weight should always be present in serialized qconv."); + + torch::List stride, padding, output_padding, dilation; + // skip kSpatialDim + int idx = 1; + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + stride.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + padding.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + dilation.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + TORCH_INTERNAL_ASSERT(idx < static_cast(config_vals.size()), + "Unexpected index = ", idx, " for config_vals of size ", + config_vals.size()); + output_padding.emplace_back(config_vals.at(idx)); + idx++; + } + int64_t groups = config_vals.at(idx); + idx++; + int64_t flags = config_vals.at(idx); + idx++; + TORCH_INTERNAL_ASSERT(idx == static_cast(config_vals.size()), + "Unexpected length of config_vals, expected ", + idx, + " got ", + config_vals.size()); + + bool transpose = flags & (1 << 0); + + int64_t other_flags = flags & ~(1 << 0); + TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, "."); + + auto& ctx = at::globalContext(); + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::X86) { +#if AT_MKLDNN_ENABLED() + bool use_onednn = onednn_utils::should_use_onednn_quant( + weight.value(), transpose, groups, output_padding); + if (use_onednn) { + return PackedConvWeightsOnednn::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, 
+ groups, + transpose + ); + } +#endif + return PackedConvWeight::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } // x86 +#endif + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + return PackedConvWeight::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_FBGEMM +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + kSpatialDim == 2, + "prepack/__setstate__: QNNPACK only supports Conv2d " + "now."); + return PackedConvWeightsQnnp::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // AT_MKLDNN_ENABLED() +TORCH_CHECK( + false, + "Didn't find engine for when deserializing ConvPackedParams: ", + toString(ctx.qEngine())); +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..dbfb406ea55dbb50f97b1e86efb52c337af04847 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h @@ -0,0 +1,13 @@ +#pragma once + +#ifdef USE_PYTORCH_QNNPACK + +namespace at { +namespace native { + +void initQNNPACK(); + +} // namespace native +} // namespace at + +#endif diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h new file mode 100644 index 0000000000000000000000000000000000000000..86ed0f530f9c315410f8dbf2bb14d353cb92730c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h @@ -0,0 +1,34 @@ +#pragma once +#include +#include + +namespace at { +namespace native { +Tensor& embedding_bag_byte_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const c10::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& embedding_bag_4bit_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const c10::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const c10::optional& per_sample_weights_, + const c10::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight); + +} // native +} // at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/verbose_wrapper.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/verbose_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..59d9682e345b4440e103a1f95c6da42208764aba --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/verbose_wrapper.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +namespace torch::verbose { +TORCH_API int _mkl_set_verbose(int enable); +TORCH_API int _mkldnn_set_verbose(int level); +} // namespace torch::verbose diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..57a5999e055083a5aebe599ac25fa68736d3e767 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sparse_sum_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_sum_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor") + static at::Tensor call(const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim); +}; + +struct TORCH_API _sparse_sum_backward_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_sum_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + 
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_sum_backward.out(Tensor grad, Tensor self, int[] dim, *, Tensor(a!) out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7f9dcd4e1ed0ddf5157fc17f67554ef0cf9a6d69 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _test_autograd_multiple_dispatch_view_copy { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view_copy") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor") + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API _test_autograd_multiple_dispatch_view_copy_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view_copy") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view_copy.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..6332bfdc50e0112e2184f7f02da88f1b15b2f284 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_meta_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor atan(const at::Tensor & self); +TORCH_API at::Tensor & atan_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & atan_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & atan_(at::Tensor & self); + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/geqrf_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/geqrf_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..87d3f521d623b05e3ae74ba8e93887ccd05a1831 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/geqrf_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::tuple geqrf(const at::Tensor & self); +TORCH_API ::std::tuple geqrf_out(at::Tensor & a, at::Tensor & tau, const at::Tensor & self); +TORCH_API ::std::tuple geqrf_outf(const at::Tensor & self, at::Tensor & a, at::Tensor & tau); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..036ec5cf8388536fa56c0f19ef6517cb5fdcc8d6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_i0 : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..66dee7e90ee1c980cbac2acb5f90325462a797cb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API is_pinned { + using schema = bool (const at::Tensor &, c10::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::is_pinned") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "is_pinned(Tensor self, Device? device=None) -> bool") + static bool call(const at::Tensor & self, c10::optional device); + static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional device); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isfinite.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isfinite.h new file mode 100644 index 0000000000000000000000000000000000000000..89d6e402b11a2fb577baafef460e252c3e3ae663 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isfinite.h @@ -0,0 +1,30 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::isfinite(Tensor self) -> Tensor +inline at::Tensor isfinite(const at::Tensor & self) { + return at::_ops::isfinite::call(self); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigvals_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigvals_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e8eabbe79cd4e5198775c977d1e4ab92bfa0ea03 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigvals_cpu_dispatch.h 
@@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor & linalg_eigvals_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & linalg_eigvals_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logical_or.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logical_or.h new file mode 100644 index 0000000000000000000000000000000000000000..71b55d18a23e49c239c8221343a32078b53b667b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logical_or.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::logical_or(Tensor self, Tensor other) -> Tensor +inline at::Tensor logical_or(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::logical_or::call(self, other); +} + +// aten::logical_or.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & logical_or_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::logical_or_out::call(self, other, out); +} +// aten::logical_or.out(Tensor self, Tensor other, *, Tensor(a!) 
out) -> Tensor(a!) +inline at::Tensor & logical_or_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::logical_or_out::call(self, other, out); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..37cfa9117a63e9854c5f25d040cc52704dd58237 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API multinomial_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, bool, c10::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multinomial") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional generator, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional generator, at::Tensor & out); +}; + +struct TORCH_API multinomial { + using schema = at::Tensor (const at::Tensor &, int64_t, bool, c10::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multinomial") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor") + static at::Tensor call(const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional generator); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, c10::optional generator); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_dropout_backward_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_dropout_backward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4a0017b5ca6edbf03d09da17c8475060328e83c9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_dropout_backward_cpu_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the 
operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor native_dropout_backward(const at::Tensor & grad_output, const at::Tensor & mask, double scale); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9051dffb2d89bcab3da645dc0314068df97695d6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h @@ -0,0 +1,21 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor nested_to_padded_tensor(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=c10::nullopt); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3f7d5a090449fd008bc2a70f3d3f26ad18ae683c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal_cuda_dispatch.h @@ -0,0 +1,32 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes 
that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & normal_(at::Tensor & self, double mean=0, double std=1, c10::optional generator=c10::nullopt); +TORCH_API at::Tensor normal(const at::Tensor & mean, double std=1, c10::optional generator=c10::nullopt); +TORCH_API at::Tensor & normal_out(at::Tensor & out, const at::Tensor & mean, double std=1, c10::optional generator=c10::nullopt); +TORCH_API at::Tensor & normal_outf(const at::Tensor & mean, double std, c10::optional generator, at::Tensor & out); +TORCH_API at::Tensor normal(double mean, const at::Tensor & std, c10::optional generator=c10::nullopt); +TORCH_API at::Tensor & normal_out(at::Tensor & out, double mean, const at::Tensor & std, c10::optional generator=c10::nullopt); +TORCH_API at::Tensor & normal_outf(double mean, const at::Tensor & std, c10::optional generator, at::Tensor & out); +TORCH_API at::Tensor normal(const at::Tensor & mean, const at::Tensor & std, c10::optional generator=c10::nullopt); +TORCH_API at::Tensor & normal_out(at::Tensor & out, const at::Tensor & mean, const at::Tensor & std, c10::optional generator=c10::nullopt); +TORCH_API at::Tensor & normal_outf(const at::Tensor & mean, const at::Tensor & std, c10::optional generator, at::Tensor & out); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3c6bcbfc97fb839cb850562c37df29668341c78c --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h @@ -0,0 +1,27 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor remainder(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & remainder_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & remainder_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & remainder_(at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor remainder(const at::Scalar & self, const at::Tensor & other); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..9179e7e3e4599013de49c723c8838e9d9a72e85c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h @@ -0,0 +1,23 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & select_copy_int_out_symint(const at::Tensor & self, int64_t dim, c10::SymInt index, 
at::Tensor & out); +TORCH_API at::Tensor select_copy_sparse_csr(const at::Tensor & self, int64_t dim, int64_t index); +TORCH_API at::Tensor select_copy_symint(const at::Tensor & self, int64_t dim, c10::SymInt index); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8797587a5c47da989771e0fd3dd15b2cc2337ecb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_erfcx(const at::Tensor & self); +TORCH_API at::Tensor & special_erfcx_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_erfcx_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..efcfe502b8cbaf3c5ac37dc26ad23694656ad573 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cpu_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor special_laguerre_polynomial_l(const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_laguerre_polynomial_l_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_laguerre_polynomial_l_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..83899990b92e9a8e14afc117c27961e0beae7a09 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a165c17de22fdf03517f50674497d7f52dcbd1f5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cpu_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor trunc(const at::Tensor & self); +TORCH_API at::Tensor & trunc_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & trunc_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & trunc_(at::Tensor & self); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..81f92307c4aead7aa1f466dd68962c87a2383794 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trunc_cuda_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor trunc(const at::Tensor & self); +TORCH_API at::Tensor & trunc_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & trunc_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & trunc_(at::Tensor & self); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..e49e02330fe45680e654c4c6e28bf84d65e2294e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +TORCH_API at::Tensor upsample_linear1d(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional> scale_factors); +struct TORCH_API structured_upsample_linear1d_out_cpu : public at::meta::structured_upsample_linear1d { +void impl(const at::Tensor & self, at::ArrayRef output_size, bool align_corners, c10::optional scales, const at::Tensor & out); +}; +struct TORCH_API structured_upsample_linear1d_out_cuda : public at::meta::structured_upsample_linear1d { +void impl(const at::Tensor & self, at::ArrayRef output_size, bool align_corners, c10::optional scales, const at::Tensor & out); +}; +} // namespace native +} // namespace at