diff --git a/.gitattributes b/.gitattributes index 9fd5cb3f4b31ed059a55aed8199713901de48e97..b076c7e468edac29543e48ad0e4021d0b441fca6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -78,3 +78,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64.exe filter=lfs diff=lfs merge=lfs -text tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn.so.8 filter=lfs diff=lfs merge=lfs -text tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/distlib/t64-arm.exe filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.11.0 filter=lfs diff=lfs merge=lfs -text +tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/cudagraph_trees.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.11.0 b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.11.0 new file mode 100644 index 0000000000000000000000000000000000000000..750042440a018d7d30fe670a719789b52b3e26c0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.11.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0da41ae1323cf4eeb610123d69d7714124cfe5ebfcc4e45f02b910e51c57ee6 +size 679264 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/cudagraph_trees.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/cudagraph_trees.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0186b210b2b8de8074112c2e5f771a086023d2ec --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/__pycache__/cudagraph_trees.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f112096a5626f67e200c68699bf622cf45f14ef9d7136d8c68afda693609bcdb +size 106203 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h new file mode 100644 index 0000000000000000000000000000000000000000..970b7da5cb70931ccb450a6ec24d511f975248c6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/CompositeRandomAccessor.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +namespace at::native { + +struct TupleInfoCPU { + template + using tuple = std::tuple; + + template + static constexpr auto tie(Types&... args) noexcept { + return std::tie(args...); + } +}; + +template +using CompositeRandomAccessorCPU = + CompositeRandomAccessor; + +template +void swap( + references_holder rh1, + references_holder rh2 +) { + return std::swap(rh1.data(), rh2.data()); +} + +template +auto get(references_holder rh) -> decltype(std::get(rh.data())) { + return std::get(rh.data()); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h new file mode 100644 index 0000000000000000000000000000000000000000..a7df275edf1de112a8002f835e0d85346cd67997 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/DispatchStub.h @@ -0,0 +1,315 @@ +#pragma once + +#include +#include + +#include +#include + +// Implements instruction set specific function dispatch. 
+// +// Kernels that may make use of specialized instruction sets (e.g. AVX2) are +// compiled multiple times with different compiler flags (e.g. -mavx2). A +// DispatchStub contains a table of function pointers for a kernel. At runtime, +// the fastest available kernel is chosen based on the features reported by +// cpuinfo. +// +// Example: +// +// In native/MyKernel.h: +// using fn_type = void(*)(const Tensor& x); +// DECLARE_DISPATCH(fn_type, stub); +// +// In native/MyKernel.cpp +// DEFINE_DISPATCH(stub); +// +// In native/cpu/MyKernel.cpp: +// namespace { +// // use anonymous namespace so that different cpu versions won't conflict +// void kernel(const Tensor& x) { ... } +// } +// REGISTER_DISPATCH(stub, &kernel); +// +// To call: +// stub(kCPU, tensor); +// +// TODO: CPU instruction set selection should be folded into whatever +// the main dispatch mechanism is. + +// ignore warnings about DispatchStub::DEFAULT, AVX, AVX2 defined elsewhere +C10_CLANG_DIAGNOSTIC_PUSH() +C10_CLANG_DIAGNOSTIC_IGNORE("-Wundefined-var-template") + +namespace at::native { + +enum class CPUCapability { + DEFAULT = 0, +#if defined(HAVE_VSX_CPU_DEFINITION) + VSX = 1, +#elif defined(HAVE_ZVECTOR_CPU_DEFINITION) + ZVECTOR = 1, +#else + AVX2 = 1, + AVX512 = 2, +#endif + NUM_OPTIONS +}; + +CPUCapability get_cpu_capability(); + +template +struct DispatchStub; + +/** + * The sole purpose of this class is to outline methods that don't need to be + * specialized or otherwise inlined and duplicated (by the compiler due to + * template expansion), since it causes size bloat if there are a significant + * number of specialization of the DispatchStub<> class. 
+ */ +struct TORCH_API DispatchStubImpl { + void* get_call_ptr( + c10::DeviceType device_type + , void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif + ); + + /** + * The CPU Dispatch actual method is chosen in decreasing order of preference by + * DispatchStubImpl::choose_cpu_impl() in case none is found by + * DispatchStubImpl::get_call_ptr() in cpu_dispatch_ptr. + */ + void* choose_cpu_impl( + void *DEFAULT +#ifdef HAVE_AVX512_CPU_DEFINITION + , void *AVX512 +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , void *AVX2 +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , void *VSX +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , void *ZVECTOR +#endif + ); + + // Fixing dispatch error in Windows debug builds. + // See https://github.com/pytorch/pytorch/issues/22681 for more details. + #if defined(_MSC_VER) && defined(_DEBUG) + std::atomic cpu_dispatch_ptr; + void* cuda_dispatch_ptr; + void* hip_dispatch_ptr; + void* mps_dispatch_ptr; + void* privateuse1_dispatch_ptr; + #else + std::atomic cpu_dispatch_ptr{nullptr}; + void* cuda_dispatch_ptr = nullptr; + void* hip_dispatch_ptr = nullptr; + void* mps_dispatch_ptr = nullptr; + void* privateuse1_dispatch_ptr = nullptr; + #endif +}; + +template +struct DispatchStub { + using FnPtr = rT (*) (Args...); + + DispatchStub() = default; + DispatchStub(const DispatchStub&) = delete; + DispatchStub& operator=(const DispatchStub&) = delete; + +private: + FnPtr get_call_ptr(c10::DeviceType device_type) { + return reinterpret_cast( + impl.get_call_ptr(device_type + , reinterpret_cast(DEFAULT) +#ifdef HAVE_AVX512_CPU_DEFINITION + , reinterpret_cast(AVX512) +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + , reinterpret_cast(AVX2) +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + , reinterpret_cast(VSX) +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + , 
reinterpret_cast(ZVECTOR) +#endif + ) + ); + } + +public: + template + rT operator()(c10::DeviceType device_type, ArgTypes&&... args) { + FnPtr call_ptr = get_call_ptr(device_type); + return (*call_ptr)(std::forward(args)...); + } + + void set_cuda_dispatch_ptr(FnPtr fn_ptr) { + impl.cuda_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_hip_dispatch_ptr(FnPtr fn_ptr) { + impl.hip_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_mps_dispatch_ptr(FnPtr fn_ptr) { + impl.mps_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + void set_privateuse1_dispatch_ptr(FnPtr fn_ptr) { + impl.privateuse1_dispatch_ptr = reinterpret_cast(fn_ptr); + } + + static TORCH_API FnPtr DEFAULT; +#ifdef HAVE_AVX512_CPU_DEFINITION + static TORCH_API FnPtr AVX512; +#endif +#ifdef HAVE_AVX2_CPU_DEFINITION + static TORCH_API FnPtr AVX2; +#endif +#ifdef HAVE_VSX_CPU_DEFINITION + static TORCH_API FnPtr VSX; +#endif +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + static TORCH_API FnPtr ZVECTOR; +#endif +private: + DispatchStubImpl impl; +}; + +namespace { +template +struct RegisterCUDADispatch { + RegisterCUDADispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_cuda_dispatch_ptr(value); + } +}; + +template +struct RegisterMPSDispatch { + RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_mps_dispatch_ptr(value); + } +}; + +template +struct RegisterHIPDispatch { + RegisterHIPDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + // TODO: make this point at hip_dispatch_ptr + stub.set_cuda_dispatch_ptr(value); + } +}; + +template +struct RegisterPRIVATEUSE1Dispatch { + RegisterPRIVATEUSE1Dispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) { + stub.set_privateuse1_dispatch_ptr(value); + } +}; + +} // anonymous namespace +// Compiler will complain if you put things like std::tuple in +// the `fn` argument of DECLARE_DISPATCH. 
Some possible workarounds, e.g., +// adding parentheses and using helper struct to get rid of the parentheses, do +// not work with MSVC. So do a `using`-declaration if you need to pass in such +// `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. +#define DECLARE_DISPATCH(fn, name) \ + struct name : DispatchStub { \ + name() = default; \ + name(const name&) = delete; \ + name& operator=(const name&) = delete; \ + }; \ + extern TORCH_API struct name name + +#define DEFINE_DISPATCH(name) struct name name + +#define REGISTER_ARCH_DISPATCH(name, arch, fn) \ + template <> name::FnPtr TORCH_API DispatchStub::arch = fn; + +#ifdef HAVE_AVX512_CPU_DEFINITION +#define REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX512, fn) +#else +#define REGISTER_AVX512_DISPATCH(name, fn) +#endif + +#ifdef HAVE_AVX2_CPU_DEFINITION +#define REGISTER_AVX2_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX2, fn) +#else +#define REGISTER_AVX2_DISPATCH(name, fn) +#endif + +#ifdef HAVE_VSX_CPU_DEFINITION +#define REGISTER_VSX_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, VSX, fn) +#else +#define REGISTER_VSX_DISPATCH(name, fn) +#endif + +#ifdef HAVE_ZVECTOR_CPU_DEFINITION +#define REGISTER_ZVECTOR_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, ZVECTOR, fn) +#else +#define REGISTER_ZVECTOR_DISPATCH(name, fn) +#endif + +// Macro to register the same kernel for all CPU arch types. This is useful +// if a kernel does not benefit from being recompiled across different arch types. 
+#define REGISTER_ALL_CPU_DISPATCH(name, fn) \ + REGISTER_ARCH_DISPATCH(name, DEFAULT, fn) \ + REGISTER_AVX512_DISPATCH(name, fn) \ + REGISTER_AVX2_DISPATCH(name, fn) \ + REGISTER_VSX_DISPATCH(name, fn) \ + REGISTER_ZVECTOR_DISPATCH(name, fn) + +#define REGISTER_NO_CPU_DISPATCH(name) \ + REGISTER_ALL_CPU_DISPATCH(name, nullptr) + +#define REGISTER_CUDA_DISPATCH(name, fn) \ + static RegisterCUDADispatch name ## __register(name, fn); + +#define REGISTER_HIP_DISPATCH(name, fn) \ + static RegisterHIPDispatch name ## __register(name, fn); + +#define REGISTER_MPS_DISPATCH(name, fn) \ + static RegisterMPSDispatch name ## __register(name, fn); + +#define REGISTER_PRIVATEUSE1_DISPATCH(name, fn) \ + static RegisterPRIVATEUSE1Dispatch name ## __register(name, fn); + +// NB: This macro must be used in an actual 'cu' file; if you try using +// it from a 'cpp' file it will not work! +#if defined(__CUDACC__) +#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn) +#elif defined(__HIPCC__) +// TODO: cut this over to HIP dispatch once we stop pretending that CUDA +// is HIP in the PyTorch HIPify build. +#define REGISTER_DISPATCH(name, fn) REGISTER_CUDA_DISPATCH(name, fn) +// #define REGISTER_DISPATCH(name, fn) REGISTER_HIP_DISPATCH(name, fn) +#elif defined(__OBJC__) && defined(USE_MPS) +// NB: this macro must be used from a 'mm' file in order to dispatch a MPS kernel +#define REGISTER_DISPATCH(name, fn) REGISTER_MPS_DISPATCH(name, fn) +#elif defined(CPU_CAPABILITY) +// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches. +// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others. 
+#ifdef CPU_CAPABILITY_AVX512 +#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, nullptr) +#else +#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) +#endif +#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn) +#endif +} // namespace at::native + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distance.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distance.h new file mode 100644 index 0000000000000000000000000000000000000000..c2d881ae66f6af001c255d23cb1acd613af70d5f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Distance.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +namespace at { +class Tensor; + +namespace native { + +using pdist_forward_fn = void(*)(Tensor&, const Tensor&, const double p); +using pdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&); +using cdist_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const double p); +using cdist_backward_fn = void(*)(Tensor&, const Tensor&, const Tensor&, const Tensor&, const double p, const Tensor&); + +DECLARE_DISPATCH(pdist_forward_fn, pdist_forward_stub); +DECLARE_DISPATCH(pdist_backward_fn, pdist_backward_stub); +DECLARE_DISPATCH(cdist_fn, cdist_stub); +DECLARE_DISPATCH(cdist_backward_fn, cdist_backward_stub); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FractionalMaxPooling.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FractionalMaxPooling.h new file mode 100644 index 0000000000000000000000000000000000000000..cb5438a03e7084a0278a0257409edf2bcb9c6fc6 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FractionalMaxPooling.h @@ -0,0 +1,80 @@ +#pragma once +#include +#include +#include + +namespace at::native { + +template +static inline std::vector generate_intervals( + scalar_t sample, + int64_t inputSize, + int64_t outputSize, + int64_t poolSize) { + std::vector sequence(outputSize); + if (outputSize > 1) { + scalar_t alpha = static_cast(inputSize - poolSize) / + static_cast(outputSize - 1); + + for (const auto i : c10::irange(outputSize - 1)) { + sequence[i] = + static_cast((i + sample) * alpha) - static_cast(sample * alpha); + } + } + if (outputSize > 0) { + sequence[outputSize - 1] = inputSize - poolSize; + } + return sequence; +} + +template +static inline void fractional_max_pool_check_shape( + const Tensor& input, + const Tensor& randomSamples) { + + TORCH_CHECK( + input.scalar_type() == randomSamples.scalar_type(), + "Expect _random_samples to have the same dtype as input"); + + int64_t ndimension = randomSamples.ndimension(); + TORCH_CHECK( + ndimension == 3, + "Expect _random_samples to have 3 dimensions, got ", ndimension); + + int64_t N = randomSamples.size(0); + int64_t C = randomSamples.size(1); + int64_t D = randomSamples.size(2); + + int64_t input_batch, input_channel; + if (ndim == 2) { + // fractional_max_pool2d + if (input.ndimension() == 3) { + input_batch = 1; + input_channel = input.size(0); + } else { + input_batch = input.size(0); + input_channel = input.size(1); + } + } else { + // factional_max_pool3d + if (input.ndimension() == 4) { + input_batch = 1; + input_channel = input.size(0); + } else { + input_batch = input.size(0); + input_channel = input.size(1); + } + } + + TORCH_CHECK( + N >= input_batch, + "Expect _random_samples.size(0) no less then input batch size."); + TORCH_CHECK( + C == input_channel, + "Expect _random_samples.size(1) equals to input channel size."); + TORCH_CHECK( + D == ndim, + "Expect _random_samples.size(2) equals to ", 
ndim, "; got ", D, "."); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..68b26ed1381133db9de0ba7cb2187578fb7d680d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/FunctionOfAMatrixUtils.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include + +namespace at { +struct TensorIterator; + +namespace native { + +using _compute_linear_combination_fn = void(*)( + TensorIterator& iter, + int64_t in_stride, + int64_t coeff_stride, + int64_t num_summations +); + +DECLARE_DISPATCH(_compute_linear_combination_fn, _compute_linear_combination_stub); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSampler.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSampler.h new file mode 100644 index 0000000000000000000000000000000000000000..aaeb7331c3e88647e8831125c4db59ea7a9b28e3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSampler.h @@ -0,0 +1,298 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace at::native { + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value, +// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5). 
+// if align_corners: -1 and +1 get sent to the centers of the corner pixels +// -1 --> 0 +// +1 --> (size - 1) +// scale_factor = (size - 1) / 2 +// if not align_corners: -1 and +1 get sent to the image edges +// -1 --> -0.5 +// +1 --> (size - 1) + 0.5 == size - 0.5 +// scale_factor = size / 2 +template +static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, + bool align_corners) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + return ((coord + 1) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + return ((coord + 1) * size - 1) / 2; + } +} + +// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize +// except that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static inline scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int64_t size, + bool align_corners, scalar_t *grad_in) { + if (align_corners) { + // unnormalize coord from [-1, 1] to [0, size - 1] + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1) / 2) * (size - 1); + } else { + // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] + *grad_in = static_cast(size) / 2; + return ((coord + 1) * size - 1) / 2; + } +} + +// Clips coordinates to between 0 and clip_limit - 1 +template +static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) { + return std::min(static_cast(clip_limit - 1), std::max(in, static_cast(0))); +} + +// clip_coordinates_set_grad works similarly to clip_coordinates except that +// it also returns the `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static inline scalar_t clip_coordinates_set_grad(scalar_t in, int64_t clip_limit, + scalar_t *grad_in) { + // Note that it is important for the gradient calculation that borders + // are considered out of bounds. 
+ if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + scalar_t max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +// Reflects coordinates until they fall between low and high (inclusive). +// The bounds are passed as twice their value so that half-integer values +// can be represented as ints. +template +static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, + int64_t twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. + scalar_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +// reflect_coordinates_set_grad works similarly to reflect_coordinates except +// that it also returns the `d output / d input` via pointer argument +// `grad_in`. +// This is useful in the backward pass of grid_sampler. +template +static inline scalar_t reflect_coordinates_set_grad(scalar_t in, int64_t twice_low, + int64_t twice_high, scalar_t *grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + int grad_in_mult_; + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + // `fmod` returns same sign as `in`, which is positive after the `if` above. 
+ scalar_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +// Mapping the out-of-boundary points back into boundary +// This would only affect padding_mode=border or reflection +template +static inline scalar_t compute_coordinates(scalar_t coord, int64_t size, + GridSamplerPadding padding_mode, + bool align_corners) { + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates(coord, 0, 2*(size - 1)); + } else { + coord = reflect_coordinates(coord, -1, 2*size - 1); + } + // clip coordinates to image borders + coord = clip_coordinates(coord, size); + } + return coord; +} + +// Computes the pixel source index value for a grid coordinate +template +static inline scalar_t grid_sampler_compute_source_index( + scalar_t coord, + int64_t size, + GridSamplerPadding padding_mode, + bool align_corners) { + coord = grid_sampler_unnormalize(coord, size, align_corners); + coord = compute_coordinates(coord, size, padding_mode, align_corners); + return coord; +} + +// grid_sampler_compute_source_index_set_grad works similarly to +// grid_sampler_compute_source_index except that it also returns the +// `d output / d input` via pointer argument `grad_in`. +// This is useful in the backward pass of grid_sampler. 
+template +static inline scalar_t grid_sampler_compute_source_index_set_grad( + scalar_t coord, + int64_t size, + GridSamplerPadding padding_mode, + bool align_corners, + scalar_t *grad_in) { + scalar_t grad_clip, grad_refl; + coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in); + if (padding_mode == GridSamplerPadding::Border) { + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == GridSamplerPadding::Reflection) { + // reflect coordinates by image borders + if (align_corners) { + coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl); + } else { + coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl); + } + // clip coordinates to image borders + coord = clip_coordinates_set_grad(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + return coord; +} + +static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) { + return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; +} + +template +static inline scalar_t get_value_bounded( + scalar_t* data, + scalar_t x, + scalar_t y, + int64_t W, + int64_t H, + int64_t sW, + int64_t sH, + GridSamplerPadding padding_mode, + bool align_corners) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int64_t ix = static_cast(x); + int64_t iy = static_cast(y); + + if (within_bounds_2d(iy, ix, H, W)) { + return data[iy * sH + ix * sW]; + } + return static_cast(0); +} + +template +static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w, + int64_t sH, int64_t sW, int64_t H, int64_t W, + scalar_t delta) { + if (within_bounds_2d(h, w, H, W)) { + data[h * sH + w * sW] += 
delta; + } +} + +template +static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w, + int64_t sD, int64_t sH, int64_t sW, + int64_t D, int64_t H, int64_t W, + scalar_t delta) { + if (within_bounds_3d(d, h, w, D, H, W)) { + data[d * sD + h * sH + w * sW] += delta; + } +} + +template +static inline void add_value_bounded( + scalar_t* data, + scalar_t x, + scalar_t y, + int64_t W, + int64_t H, + int64_t sW, + int64_t sH, + scalar_t delta, + GridSamplerPadding padding_mode, + bool align_corners) { + + x = compute_coordinates(x, W, padding_mode, align_corners); + y = compute_coordinates(y, H, padding_mode, align_corners); + + int64_t ix = static_cast(x); + int64_t iy = static_cast(y); + + safe_add_2d(data, iy, ix, sH, sW, H, W, delta); +} + +// Calculate the differential of the cubic convolution, i.e. `d coeff / d x` +template +static inline void get_cubic_coefficients_grad( + scalar_t coeffs[4], + scalar_t t) { + + // Must be the same as forward calculation in + // aten/src/ATen/native/UpSample.h:get_cubic_upsample_coefficients + scalar_t A = -0.75; + + scalar_t x; + x = -1 - t; // 1 < x = |-1 - tx| < 2 + coeffs[0] = (-3 * A * x - 10 * A ) * x - 8 * A; + x = -t; // x = |0 - tx| <= 1 + coeffs[1] = (-3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 1 - t; // x = |1 - tx| <= 1 + coeffs[2] = (3 * (A + 2) * x - 2 * (A + 3)) * x; + x = 2 - t; // 1 < x = |2 - tx| < 2 + coeffs[3] = (3 * A * x - 10 * A) * x + 8 * A; +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..eea21ddf5e3770c4ef23f758afb3ba79b20f3231 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/GridSamplerUtils.h @@ -0,0 +1,109 @@ +#pragma once + +// See NOTE: [Tensor 
vs. TensorBase] +// https://github.com/pytorch/pytorch/pull/66979 +#include +#include +#include + +namespace at::native { + +namespace detail { + +enum class GridSamplerInterpolation {Bilinear, Nearest, Bicubic}; +enum class GridSamplerPadding {Zeros, Border, Reflection}; + +} // namespace detail + +using detail::GridSamplerInterpolation; +using detail::GridSamplerPadding; + +namespace { + +// See NOTE [ grid_sampler Native Functions ]. +void check_grid_sampler_common( + const TensorBase& input, + const TensorBase& grid +) { + auto input_opt = input.options(); + auto grid_opt = grid.options(); + + TORCH_CHECK( + input.defined(), + "grid_sampler(): expected input to not be undefined"); + TORCH_CHECK( + grid.defined(), + "grid_sampler(): expected grid to not be undefined"); + TORCH_CHECK( + input_opt.device() == grid_opt.device(), + "grid_sampler(): expected input and grid to be on same device, but input " + "is on ", input_opt.device(), " and grid is on ", grid_opt.device()); + TORCH_CHECK( + input_opt.layout() == kStrided && grid_opt.layout() == kStrided, + "grid_sampler(): expected input and grid to have torch.strided layout, but " + "input has ", input_opt.layout(), " and grid has ", grid_opt.layout()); + TORCH_CHECK( + input.size(0) == grid.size(0), + "grid_sampler(): expected grid and input to have same batch size, but got " + "input with sizes ", input.sizes(), " and grid with sizes ", grid.sizes()); + TORCH_CHECK( + grid.size(-1) == input.dim() - 2, + "grid_sampler(): expected grid to have size ", input.dim() - 2, " in last " + "dimension, but got grid with sizes ", grid.sizes()); + + for (const auto i : c10::irange(2, input.dim())) { + TORCH_CHECK(input.size(i) > 0, + "grid_sampler(): expected input to have non-empty spatial dimensions, " + "but input has sizes ", input.sizes(), " with dimension ", i, " being " + "empty"); + } +} + +// See NOTE [ grid_sampler Native Functions ]. 
+void check_grid_sampler_2d( + const TensorBase& input, + const TensorBase& grid +) { + TORCH_CHECK( + input.dim() == 4 && input.dim() == grid.dim(), + "grid_sampler(): expected 4D input and grid with same number of " + "dimensions, but got input with sizes ", input.sizes(), + " and grid with sizes ", grid.sizes()); +} + +// See NOTE [ grid_sampler Native Functions ]. +void check_grid_sampler_3d( + const TensorBase& input, + const TensorBase& grid, + int64_t interpolation_mode +) { + TORCH_CHECK( + input.dim() == 5 && input.dim() == grid.dim(), + "grid_sampler(): expected 5D input and grid with same number of " + "dimensions, but got input with sizes ", input.sizes(), + " and grid with sizes ", grid.sizes()); + TORCH_CHECK( + !(input.dim() == 5 && + static_cast(interpolation_mode) == + GridSamplerInterpolation::Bicubic), + "grid_sampler(): bicubic interpolation only supports 4D input"); +} + +// See NOTE [ grid_sampler Native Functions ]. +// cudnn does not support inputs larger than 1024. 
+bool cond_cudnn_grid_sampler( + const TensorBase& input, + const TensorBase& grid +) { + return ( + at::native::cudnn_is_acceptable(input) && + at::native::cudnn_is_acceptable(grid) && + at::native::canUse32BitIndexMath(input) && + at::native::canUse32BitIndexMath(grid) && + input.dim() == 4 && + input.sym_size(1) <= 1024); +} + +} // anonymous namespace + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h new file mode 100644 index 0000000000000000000000000000000000000000..6db4f60b88ea1e19a1cc744f9f8c1e7b31c66a82 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Lerp.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::native { + +template +C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(scalar_t weight) { + return std::abs(weight) < scalar_t(0.5); +} +template +C10_HOST_DEVICE C10_ALWAYS_INLINE bool is_lerp_weight_small(c10::complex weight) { + // Avoid the sqrt in abs(weight) + return (weight.real() * weight.real() + weight.imag() * weight.imag()) < scalar_t(0.25); +} + +template +C10_HOST_DEVICE C10_ALWAYS_INLINE scalar_t lerp(scalar_t self_, scalar_t end_, weight_t weight_) { + using opmath_t = at::opmath_type; + using opmath_weight_t = at::opmath_type; + + opmath_t self = self_; + opmath_t end = end_; + opmath_weight_t weight = weight_; + + // Conditional for better numeric. This has been discussed in + // https://github.com/pytorch/pytorch/pull/18871 + return is_lerp_weight_small(weight) + ? 
self + weight * (end - self) + : end - (end - self) * (opmath_t(1) - weight); +} + +using lerp_fn_scalar = void (*)( + at::TensorIteratorBase& iter, + const Scalar& weight); + +using lerp_fn_tensor = void (*)( + at::TensorIteratorBase& iter); + +DECLARE_DISPATCH(lerp_fn_scalar, lerp_kernel_scalar_weight); +DECLARE_DISPATCH(lerp_fn_tensor, lerp_kernel_tensor_weight); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..0b05d5162e668c77bb1315b76cf1c683fac16a34 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/LinearAlgebraUtils.h @@ -0,0 +1,623 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#include +#include +#include +#endif + +namespace at::native { + +static inline c10::MaybeOwned expect_resolved_conj(const Tensor& tensor) { + if (tensor.is_conj()) { + return c10::MaybeOwned::owned(tensor.resolve_conj()); + } else { + return c10::MaybeOwned::borrowed(tensor); + } +} + +static inline DimVector batched_matrix_contiguous_strides( + const IntArrayRef sizes, + const bool f_contig = false) { + // f_contig chooses between the strides of a batch of Fortran (F-contiguous) + // and C-contiguous matrices + auto strides = c10::contiguous_strides(sizes); + auto dim = strides.size(); + + if (f_contig && dim >= 2) { + // Fix the strides of the last two dimensions, so that we return + // C-contiguous batches of F-contiguous matrices. 
+ strides[dim - 1] = std::max(sizes[dim - 2], static_cast(1)); + strides[dim - 2] = 1; + } + return strides; +} + +/* + * Clones a Tensor so that the following conditions hold: + * If we think of a Tensor of having size (B, M, N), where B is any number + * of batch dimensions, then: + * - Each (M, N) matrix is in column major form + * - Let Tensor P have size (B, M, N) and Q have size (B, M', N'). + * Then when laid out in memory, the M by N matrix starting at + * P.data_ptr()[B * M * N] is of the same corresponding batch as the M' by N' + * matrix starting at Q.data_ptr()[B * M' * N']. + */ +static inline Tensor cloneBatchedColumnMajor(const Tensor& src) { + // If src is already in batched column major format, then + // this will be efficient (no reordering of the data will occur) + // because the first transpose will make the tensor contiguous, + // and cloning a contiguous tensor is fast. + auto result = src.mT().clone(at::MemoryFormat::Contiguous); + result.transpose_(-2, -1); + return result; +} + +/* + * contig chooses between C-contig (true) and F-contig (false) + */ +static inline c10::MaybeOwned borrow_else_clone(const bool cond, const Tensor& borrow, const Tensor& clone, const bool contig) { + return cond ? c10::MaybeOwned::borrowed(borrow) + : c10::MaybeOwned::owned(contig ? clone.clone(MemoryFormat::Contiguous) + : cloneBatchedColumnMajor(clone)); +} + +/* + * This method is designed to be a faster alternative to + * `cloneBatchedColumnMajor` with some additional features, + * namely: + * 1. It uses `copy` instead of `clone` which could be much faster. + * 2. `nrows` parameter used to create inputs with the number of rows larger + * than the original input, which is required for some LAPACK/MAGMA methods. + * 3. `desired_batch_size` is used to create copies with the batch size + * which is either the original batch size of the input, or its larger + * broadcasted shape. 
+ */ +static inline Tensor copyBatchedColumnMajor(const Tensor& src, int64_t nrows = -1, + at::OptionalIntArrayRef desired_batch_sizes = c10::nullopt) { + nrows = (nrows == -1) ? src.size(-2) : nrows; + auto copy_sizes = desired_batch_sizes.has_value() + ? desired_batch_sizes.value().vec() + : IntArrayRef(src.sizes().data(), src.dim() - 2).vec(); + copy_sizes.insert(copy_sizes.end(), {nrows, src.size(-1)}); + const auto copy_strides = batched_matrix_contiguous_strides(copy_sizes, /*f-contig*/true); + auto copy = at::empty_strided(copy_sizes, copy_strides, src.options()); + copy.narrow(-2, 0, src.size(-2)).copy_(src); + return copy; +} + +/* + * Given batches of matrices with arbitrary batch dim, + * computes the number of batches. + */ +static inline int64_t batchCount(const Tensor& batched_matrices) { + int64_t result = 1; + for (int64_t i = 0; i < batched_matrices.ndimension() - 2; i++) { + result *= batched_matrices.size(i); + } + return result; +} + +// Computes the number of elements of a matrix in a batched matrix tensor +static inline int64_t matrixStride(const Tensor& batched_matrices) { + return batched_matrices.size(-1) * batched_matrices.size(-2); +} + +// Validates input shapes for operations on batches of square matrices (inverse, cholesky, symeig, eig) +static inline void checkIsMatrix(const Tensor& A, const char* const f_name, const char* const arg_name = "A") { + TORCH_CHECK(A.dim() >= 2, f_name, ": The input tensor ", arg_name, " must have at least 2 dimensions."); +} +static inline void squareCheckInputs(const Tensor& self, const char* const f_name, const char* const arg_name = "A") { + checkIsMatrix(self, f_name, arg_name); + TORCH_CHECK(self.sym_size(-1) == self.sym_size(-2), + f_name, + ": ", arg_name, " must be batches of square matrices, " + "but they are ", self.sym_size(-2), " by ", self.sym_size(-1), " matrices"); +} + +static inline void checkInputsSolver(const Tensor& A, + const Tensor& B, + const bool left, + const char* const f_name) { 
+ squareCheckInputs(A, f_name, "A"); + checkIsMatrix(B, f_name, "B"); + TORCH_CHECK(left ? A.size(-2) == B.size(-2) : A.size(-1) == B.size(-1), + f_name, ": Incompatible shapes of A and B for the equation ", + left ? "AX = B" : "XA = B", + " (", A.size(-2), "x", A.size(-1), " and ", B.size(-2), "x", B.size(-1), ")"); +} + +static inline bool is_row_or_column_contiguous(const Tensor& t) { + // This could be made more general, similar to how it's checked in matmul, which would allow to + // ellide the copy with strides such as (6, 12, 1, 3) or (3, 1, 9), but this is quite tricky. + // We choose to be conservative for simplicity + return t.is_contiguous() || t.transpose(-2, -1).is_contiguous(); +} + +static inline TransposeType to_transpose_type(const bool contig, const bool conj) { + if (conj) { + if (contig) { TORCH_INTERNAL_ASSERT(false, "Invalid transpose type"); } + else { return TransposeType::ConjTranspose; } + } else { + if (contig) { return TransposeType::NoTranspose; } + else { return TransposeType::Transpose; } + } +} + + +// This function is designed to be used with linear algebra methods that minimize +// L(ax - b) = 0, where L is generally the identity map (`solve`, for example) +// or the L2 norm (`lstsq`). +// It is expected that `a` and `b` are contiguous tensors of column-major matrices +// (so that a.view({-1, a.size(-2), a.size(-1)}) succeeds, same for `b`), +// with the following additional properties: +// +// 1. a.dim() == b.dim() +// 2. a.shape[:-2] broadcasts over b.shape[:-2] +// 3. 
a.size(i) <= b.size(i) for i=0,..., a.dim() - 3 (only for batch dimensions) +// +// MAGMA/LAPACK modify tensor `a` in-place, and the main goal of this method +// is to be memory efficient, which means that if there exists an index i such that +// a.shape[i] < b.shape[i], 0 <= i <= a.dim() - 3, +// then instead of materializing copies of `a` in the broadcasted shape, we keep +// a buffer copy of `a` along with flags that check whether specific batch dimension +// indices for `a` were already accessed. If they were, we copy the data from the buffer +// into `a`. The number of copies does not exceed +// prod(max(a.shape[:-2], b.shape[:-2]) - a.shape[:-2] + 1) +// and this value is attained by tensors with non-empty batch dimensions. +// +// func_t `f` is a callable that is being supplied with +// scalar_t* a_working_ptr, scalar_t* b_working_ptr, int64_t a_linear_batch_idx. +// a_working_ptr and b_working_ptr can directly be passed to LAPACK/MAGMA routines, +// and a_linear_batch_idx is an index in the 3d representation which corresponds to +// the memory a_working_ptr points to, in other words: +// a_working_ptr == a.view({-1, a.size(-2), a.size(-1)}).select(0, a_linear_batch_idx).data_ptr<scalar_t>(); +// a_linear_batch_idx is useful to store metadata related to `a`, such as, for example, +// its rank or singular values (see linalg_lstsq). 
+template +void batch_iterator_with_broadcasting(const Tensor& a, const Tensor& b, const func_t& f) { + IntArrayRef a_batch_sizes(a.sizes().data(), a.dim() - 2); + IntArrayRef b_batch_sizes(b.sizes().data(), b.dim() - 2); + + auto a_linear_batch_idx = at::arange(batchCount(a)).view(a_batch_sizes); + auto b_linear_batch_idx = at::arange(batchCount(b)).view(b_batch_sizes); + + TensorIterator iter = TensorIteratorConfig() + .set_check_mem_overlap(false) + .check_all_same_dtype(false) + .resize_outputs(false) + .add_output(b_linear_batch_idx) + .add_input(a_linear_batch_idx) + .build(); + + auto m = a.size(-2); + auto n = a.size(-1); + auto a_3d = a.view({batchCount(a), m, n}); + auto b_3d = b.view({batchCount(b), b.size(-2), b.size(-1)}); + + auto a_broadcasts_over_b = (a_batch_sizes != b_batch_sizes); + Tensor a_buffer, a_was_accessed, a_buffer_3d; + std::function check_if_copy_needed_for_a + = [](int64_t /*a_curr_linear_batch_idx*/){}; + if (a_broadcasts_over_b) { + a_buffer = at::empty_strided(a.sizes(), a.strides(), a.options()) + .copy_(a); + a_was_accessed = at::zeros(batchCount(a), at::kBool); + a_buffer_3d = a_buffer.view({batchCount(a), m, n}); + check_if_copy_needed_for_a = [&](int64_t a_curr_linear_batch_idx) { + auto* a_was_accessed_flag = a_was_accessed + .select(0, a_curr_linear_batch_idx) + .data_ptr(); + if (!(*a_was_accessed_flag)) { + *a_was_accessed_flag = true; + } + else { + a_3d.select(0, a_curr_linear_batch_idx) + .copy_(a_buffer_3d.select(0, a_curr_linear_batch_idx)); + } + }; + } + + auto loop = [&](char** data, const int64_t* strides, int64_t nelems) { + auto* b_batch_idx_ptr = data[0]; + auto* a_batch_idx_ptr = data[1]; + + for (const auto elem C10_UNUSED : c10::irange(nelems)) { + auto b_curr_linear_batch_idx = *reinterpret_cast(b_batch_idx_ptr); + auto a_curr_linear_batch_idx = *reinterpret_cast(a_batch_idx_ptr); + + check_if_copy_needed_for_a(a_curr_linear_batch_idx); + + auto* a_working_ptr = a_3d.select(0, a_curr_linear_batch_idx) + 
.data_ptr(); + auto* b_working_ptr = b_3d.select(0, b_curr_linear_batch_idx) + .data_ptr(); + f(a_working_ptr, b_working_ptr, a_curr_linear_batch_idx); + + b_batch_idx_ptr += strides[0]; + a_batch_idx_ptr += strides[1]; + } + }; + iter.serial_for_each(loop, {0, batchCount(b)}); +} + +// Returns the epsilon value for floating types except half +static inline double _get_epsilon(const ScalarType& sc_type) { + switch (sc_type) { + case at::ScalarType::Float: + return static_cast(std::numeric_limits::epsilon()); + case at::ScalarType::Double: + return std::numeric_limits::epsilon(); + default: + AT_ERROR("This function doesn't handle types other than float and double"); + } +} + +// Validates input shapes and devices +// for linear solve methods (solve, cholesky_solve, lu_solve, triangular_solve) +static inline void linearSolveCheckInputs(const Tensor& self, const Tensor& A, const char* name) { + TORCH_CHECK(self.device() == A.device(), + "Expected b and A to be on the same device, but found b on ", + self.device(), " and A on ", A.device(), " instead."); + + TORCH_CHECK(self.scalar_type() == A.scalar_type(), + "Expected b and A to have the same dtype, but found b of type ", + self.scalar_type(), " and A of type ", A.scalar_type(), " instead."); + + TORCH_CHECK(A.size(-1) == A.size(-2), + "A must be batches of square matrices, " + "but they are ", A.size(-2), " by ", A.size(-1), " matrices"); + + TORCH_CHECK(A.size(-1) == self.size(-2), + "Incompatible matrix sizes for ", name, ": each A " + "matrix is ", A.size(-1), " by ", A.size(-1), + " but each b matrix is ", self.size(-2), " by ", self.size(-1)); +} + +static inline void checkFloatingOrComplex(const Tensor& t, const char* const f_name, const bool allow_low_precision_dtypes=true) { + auto dtype = t.scalar_type(); + TORCH_CHECK((at::isFloatingType(dtype) || at::isComplexType(dtype)), + f_name, ": Expected a floating point or complex tensor as input. 
Got ", dtype); + if (!allow_low_precision_dtypes) { + TORCH_CHECK(dtype == kFloat || dtype == kDouble || dtype == kComplexFloat || dtype == kComplexDouble, + f_name, ": Low precision dtypes not supported. Got ", dtype); + } +} + + +// Checks if all the Tensors in a TensorList are of the same dimensions +static inline void checkAllSameDim(TensorList tensors, int64_t dim) { + for (auto &t : tensors) { + TORCH_CHECK(t.dim() == dim, "Tensor dimension is ", t.dim(), ", expected ", dim, " instead."); + } +} + +static inline std::tuple, std::vector> _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2) { + // broadcast the batch dimensions of arg1 and arg2. + IntArrayRef arg1_batch_sizes(arg1.sizes().data(), arg1.ndimension() - 2); + IntArrayRef arg2_batch_sizes(arg2.sizes().data(), arg2.ndimension() - 2); + std::vector expand_batch_portion = infer_size(arg1_batch_sizes, arg2_batch_sizes); + + std::vector arg1_expand_size({expand_batch_portion}); + arg1_expand_size.insert(arg1_expand_size.end(), { arg1.size(-2), arg1.size(-1) }); + + std::vector arg2_expand_size({expand_batch_portion}); + arg2_expand_size.insert(arg2_expand_size.end(), { arg2.size(-2), arg2.size(-1) }); + return std::make_tuple(std::move(arg1_expand_size), std::move(arg2_expand_size)); +} + +static inline std::tuple _linalg_broadcast_batch_dims(const Tensor& arg1, const Tensor& arg2, const char* name) { + // If there's no name we assume we don't want to check the errors + if (name != nullptr) { + linearSolveCheckInputs(arg1, arg2, name); + } + + auto [arg1_expand_size, arg2_expand_size] = at::native::_linalg_broadcast_batch_dims(arg1, arg2); + + auto arg1_broadcasted = arg1_expand_size == arg1.sizes() ? arg1 : arg1.expand(arg1_expand_size); + auto arg2_broadcasted = arg2_expand_size == arg2.sizes() ? 
arg2 : arg2.expand(arg2_expand_size); + return std::make_tuple(arg1_broadcasted, arg2_broadcasted); +} + +static inline std::vector broadcast_batch_size(const Tensor& t1, const Tensor& t2, int64_t n_batch_dims) { + IntArrayRef t1_batch_sizes(t1.sizes().data(), n_batch_dims); + IntArrayRef t2_batch_sizes(t2.sizes().data(), n_batch_dims); + auto broadcasted_batch_sizes = infer_size(t1_batch_sizes, t2_batch_sizes); + return broadcasted_batch_sizes; +} + +// Return a permutation with the given axes moved to the end. +static inline Tensor _move_to_end(const Tensor& self, IntArrayRef axes) { + const std::vector a = axes.vec(); + const int64_t ndim = self.ndimension(); + std::vector perm; + + for (const auto i : c10::irange(ndim)) { + auto it = std::find(a.begin(), a.end(), i); + if (it == a.end()) { + perm.push_back(i); + } + } + for (auto i : a) { + perm.push_back(i); + } + + TORCH_CHECK((int64_t)perm.size() == ndim, + "duplicate or invalid axis in 'dim' argument for tensor with ndim==", ndim); + + return self.permute(perm); +} + +// parse the "mode" param in linalg_qr: return a tuple of bools (compute_q, reduced) +static inline std::tuple _parse_qr_mode(c10::string_view mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; // this is actually irrelevant in this mode + } else { + TORCH_CHECK(false, "qr received unrecognized mode '", mode, + "' but expected one of 'reduced' (default), 'r', or 'complete'"); + } + return std::make_tuple(compute_q, reduced); +} + +// Function to compute sizes, strides and the extra columns for the Q matrix in the QR Decomposition +static inline std::tuple _compute_geometry_for_Q( + const Tensor& input, + bool reduced) { + int64_t m = input.size(-2), n = input.size(-1); + int64_t n_columns_q; + + // We need to compute the required size of Q based on 
the `reduced` option + DimVector q_sizes(input.sizes()); + if (!reduced && m > n) { + q_sizes[input.dim() - 1] = m; + n_columns_q = m; + } else { + q_sizes[input.dim() - 1] = n; + n_columns_q = std::min(m, n); + } + auto q_strides = batched_matrix_contiguous_strides(q_sizes, /*f-contig*/true); + return std::make_tuple(q_sizes, q_strides, n_columns_q); +} + +static inline bool svd_uses_cusolver(const Tensor& A) { + // if cusolver is available, it is used unconditionally + return A.is_cuda() + && at::globalContext().hasCuSOLVER() + && at::globalContext().linalgPreferredBackend() != at::LinalgBackend::Magma; +} + + +// Function used instead of .to so that the original strides are retained +// .to doesn't retain strides and make the output tensor contiguous +static inline Tensor same_stride_to(const Tensor& original_tensor, const at::TensorOptions& options) { + auto strided_to = at::empty_strided(original_tensor.sizes(), + original_tensor.strides(), + options); + strided_to.copy_(original_tensor); + return strided_to; +} + +// Creates a dimension permutation array that can be given to `at::permute()`, which will shift +// the two specified dimensions to the end of a tensor, without changing the order of +// the other dimensions. `dim1` will be placed at the very end, and `dim0` will be +// placed just to the left of it. +// +// For instance, given a 4-D tensor, dimensions 1 and 3 can be shifted to the end by +// calling `create_dim_backshift_permutation(1, 3, 4)`. The resulting vector will +// be `vec(0, 2, 1, 3)`. 
+static inline std::vector<int64_t> create_dim_backshift_permutation(int64_t dim0, int64_t dim1, int64_t ndim) {
+  // Both dimensions must be distinct and lie in [0, ndim); negative indices
+  // are rejected rather than wrapped.
+  TORCH_CHECK(
+    (dim0 != dim1) && (dim0 < ndim) && (dim0 >= 0) && (dim1 < ndim) && (dim1 >= 0),
+    "duplicate or invalid dimensions");
+  std::vector<int64_t> permutation(ndim);
+  int64_t cur_permuted_dim = 0;
+  // First list every dimension other than dim0/dim1, in their original order ...
+  for (const auto dim_ind : c10::irange(ndim)) {
+    if ((dim_ind != dim0) && (dim_ind != dim1)) {
+      permutation[cur_permuted_dim++] = dim_ind;
+    }
+  }
+  // ... then append dim0 and dim1 as the last two entries.
+  permutation[cur_permuted_dim++] = dim0;
+  permutation[cur_permuted_dim] = dim1;
+  return permutation;
+}
+
+// Creates a dimension permutation array that can be given to `at::permute()`, which
+// will reverse a given permutation.
+// The reverse permutation array is created by swapping the indices and their
+// associated values from the given permutation array.
+// NOTE(review): entries are used as indices without validation, so the
+// argument is assumed to be a permutation of 0..ndim-1.
+static inline std::vector<int64_t> create_reverse_permutation(std::vector<int64_t> permutation) {
+  int64_t ndim = permutation.size();
+  std::vector<int64_t> reverse_permutation(ndim);
+  for (const auto dim_ind : c10::irange(ndim)) {
+    reverse_permutation[permutation[dim_ind]] = dim_ind;
+  }
+  return reverse_permutation;
+}
+
+// Compute R-work array size for MAGMA/LAPACK cgesdd/zgesdd
+// See https://github.com/Reference-LAPACK/lapack/blob/122506cd8b6ce050a200920c3d4c0b153b150fd8/SRC/cgesdd.f#L186
+//
+// `jobz` is the job character; only 'N' is special-cased here, every other
+// value uses the general formulas below. `m` and `n` are the matrix
+// dimensions.
+static inline int64_t computeLRWorkDim(const char jobz, int64_t m, int64_t n) {
+  auto mn = std::min(m, n);
+  auto mx = std::max(m, n);
+  if (jobz == 'N') {
+#ifdef __APPLE__
+    // According to `vecLib.framework/Headers/clapack.h` Accelerate.framework is based on LAPACK 3.2.1
+    return 7 * mn;
+#else
+    // This setting is valid for LAPACK 3.6+
+    return 5 * mn;
+#endif
+  }
+  if (mx > 10 * mn) {
+    return 5 * mn * mn + 5 * mn;
+  }
+  return std::max(5 * mn * mn + 5 * mn, 2 * mx * mn + 2 * mn * mn + mn);
+}
+
+// This function checks whether the uplo argument input is valid
+// Allowed strings are "u", "U", "l", "L"
+static inline void checkUplo(const c10::string_view uplo) {
+  // Validate the length before touching uplo[0]: indexing an empty view is
+  // out of bounds. The error message is the same as for a wrong letter.
+  TORCH_CHECK(uplo.size() == 1,
+    "Expected UPLO argument to be 'L' or 'U', but got ", uplo);
+  // To use std::toupper safely with plain chars (or signed chars), the argument should first be converted to unsigned char
+  char uplo_uppercase = static_cast<char>(std::toupper(static_cast<unsigned char>(uplo[0])));
+  TORCH_CHECK(uplo_uppercase == 'U' || uplo_uppercase == 'L',
+    "Expected UPLO argument to be 'L' or 'U', but got ", uplo);
+}
+
+// Checks that `result` and `input` live on the same device; `fn_name` and
+// `result_name` are only used to build the error message.
+static inline void checkSameDevice(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") {
+  TORCH_CHECK(
+    result.device() == input.device(),
+    fn_name,
+    ": Expected ", result_name, " and input tensors to be on the same device, but got ",
+    result_name, " on ", result.device(), " and input on ", input.device());
+}
+
+// Check the dtype of result and input tensors (for _out variants).
+// Most linear algebra functions have the same dtype for input and output
+// (either floating or complex type input), so we can check whether input's dtype can be casted to result's dtype.
+// According to https://github.com/pytorch/pytorch/wiki/Developer-FAQ#how-does-out-work-in-pytorch
+// c10::canCast is used for checking the "safe copy" dtype requirements.
+static inline void checkLinalgCompatibleDtype(const std::string& fn_name, Tensor result, Tensor input, const std::string& result_name = "result") { + bool can_cast = c10::canCast(input.scalar_type(), result.scalar_type()); + TORCH_CHECK( + can_cast, + fn_name, + ": Expected ", result_name, " to be safely castable from ", input.scalar_type(), " dtype, but got ", + result_name, " with dtype ", result.scalar_type()); +} + +// Alternatively, we can check whether the specific expected output type (result_type) can be safely casted to out tensor dtype (out_type) +static inline void checkLinalgCompatibleDtype(const std::string& fn_name, ScalarType out_type, ScalarType result_type, const std::string& out_name = "result") { + bool can_cast = c10::canCast(result_type, out_type); + TORCH_CHECK( + can_cast, + fn_name, + ": Expected ", out_name, " to be safely castable from ", result_type, " dtype, but got ", + out_name, " with dtype ", out_type); +} + +static inline void checkNotComplexTolerance(const Tensor& tol, const c10::string_view f_name, const c10::string_view tol_name) { + TORCH_CHECK(!at::isComplexType(tol.scalar_type()), + f_name, ": ", tol_name, " tensor of complex type is not supported. Got ", tol.scalar_type()); +} + +/* + Two types of 'other' tensors are supported when solving + a system of linear equations matmul(input, x) = other: + * 1-dimensional (1D) tensor or batch of 1D tensors (vector case) + * 2-dimensional (2D) tensor or batch of 2D tensors (matrix case). + The original torch.solve supported only the matrix case, while NumPy works for both cases. + For the batched input we need to be able to distinguish them. + Let input.shape = (batch_dimensions, m, n), then 'other' is of vector type if other.shape == (batch_dimensions, m). 
+ This rule is compatible with NumPy, see https://github.com/numpy/numpy/blob/v1.20.0/numpy/linalg/linalg.py#L384-L389 +*/ +static inline bool linalg_solve_is_vector_rhs(const Tensor& input, const Tensor& other) { + auto expected_batched_rhs_shape = SymIntArrayRef(input.sym_sizes().data(), input.dim() - 1); // input.shape[:-1] + bool vector_case = other.dim() == 1 || (input.dim() - 1 == other.dim() && other.sym_sizes().equals(expected_batched_rhs_shape)); + return vector_case; +} + +/* + Computes linear indices for a tensor with original_shape to access its elements like it was a materialized broadcast tensor. +*/ +static inline Tensor get_linear_indices(int64_t numel, IntArrayRef original_shape, IntArrayRef broadcast_shape) { + TensorOptions options = at::TensorOptions().dtype(at::kLong).device(at::kCPU); + return at::arange(numel, options).view(original_shape).broadcast_to(broadcast_shape).contiguous(); +} + +class BroadcastLinearIndices { + private: + Tensor linear_indices_; + bool is_broadcasting_; + + public: + BroadcastLinearIndices( + int64_t numel, + IntArrayRef original_shape, + IntArrayRef broadcast_shape) : is_broadcasting_(!original_shape.equals(broadcast_shape)) { + // The assumption is that the broadcast_shape is a materialized broadcast + // shape of the original_shape. We need to compute the linear indices + // compatible with the original_shape to access the elements in the original + // tensor corresponding to the broadcast tensor. + if (is_broadcasting_) { + linear_indices_ = + get_linear_indices(numel, original_shape, broadcast_shape); + } + } + int64_t operator()(int64_t broadcast_linear_index) { + return is_broadcasting_ + ? 
linear_indices_.data_ptr()[broadcast_linear_index] + : broadcast_linear_index; + } +}; + +static inline bool is_blas_compatible_column_major_order(const Tensor& input) { + IntArrayRef input_strides = input.strides(); + IntArrayRef input_sizes = input.sizes(); + auto ndim = input.dim(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.transpose(-2, -1).is_contiguous(); + } + auto leading_dimension = input_strides[ndim - 1]; + auto rows = input_sizes[ndim - 2]; + bool batch_stride_compatible = true; + if (ndim == 3) { + auto cols = input_sizes[ndim - 1]; + batch_stride_compatible = + input_strides[ndim - 3] >= leading_dimension * cols; + } + return (input_strides[ndim - 2] == 1) && + (leading_dimension >= std::max(1, rows)) && + batch_stride_compatible; +} + +static inline bool is_blas_compatible_row_major_order(const Tensor& input) { + IntArrayRef input_strides = input.strides(); + IntArrayRef input_sizes = input.sizes(); + auto ndim = input.dim(); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(ndim >= 2); + if (ndim > 3) { + return input.is_contiguous(); + } + auto leading_dimension = input_strides[ndim - 2]; + auto cols = input_sizes[ndim - 1]; + bool batch_stride_compatible = true; + if (ndim == 3) { + auto rows = input_sizes[ndim - 2]; + batch_stride_compatible = + input_strides[ndim - 3] >= leading_dimension * rows; + } + return (input_strides[ndim - 1] == 1) && + (leading_dimension >= std::max(1, cols)) && + batch_stride_compatible; +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h new file mode 100644 index 0000000000000000000000000000000000000000..97b0854d82d0a2fec6bb708db767d81273ec7bcc --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/MathBitFallThroughLists.h @@ -0,0 +1,71 @@ +#pragma once + +namespace at { +// views and their in-place version ops +#define TORCH_VIEW_FNS(m) \ + m.impl("as_strided_", torch::CppFunction::makeFallthrough()); \ + m.impl("detach", torch::CppFunction::makeFallthrough()); \ + m.impl("detach_", torch::CppFunction::makeFallthrough()); \ + m.impl("diagonal", torch::CppFunction::makeFallthrough()); \ + m.impl("expand", torch::CppFunction::makeFallthrough()); \ + m.impl("expand_as", torch::CppFunction::makeFallthrough()); \ + m.impl("movedim.int", torch::CppFunction::makeFallthrough()); \ + m.impl("movedim.intlist", torch::CppFunction::makeFallthrough()); \ + m.impl("narrow", torch::CppFunction::makeFallthrough()); \ + m.impl("permute", torch::CppFunction::makeFallthrough()); \ + m.impl("select.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("select.int", torch::CppFunction::makeFallthrough()); \ + m.impl("squeeze", torch::CppFunction::makeFallthrough()); \ + m.impl("squeeze_", torch::CppFunction::makeFallthrough()); \ + m.impl("transpose.int", torch::CppFunction::makeFallthrough()); \ + m.impl("transpose.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("transpose_", torch::CppFunction::makeFallthrough()); \ + m.impl("t", torch::CppFunction::makeFallthrough()); \ + m.impl("t_", torch::CppFunction::makeFallthrough()); \ + m.impl("real", torch::CppFunction::makeFallthrough()); \ + m.impl("imag", torch::CppFunction::makeFallthrough()); \ + m.impl("view_as_real", torch::CppFunction::makeFallthrough()); \ + m.impl("unflatten.int", torch::CppFunction::makeFallthrough()); \ + m.impl("unflatten.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("unfold", torch::CppFunction::makeFallthrough()); \ + m.impl("unsqueeze", torch::CppFunction::makeFallthrough()); \ + m.impl("unsqueeze_", torch::CppFunction::makeFallthrough()); \ + m.impl("view_as", 
torch::CppFunction::makeFallthrough()); \ + m.impl("unbind.int", torch::CppFunction::makeFallthrough()); \ + m.impl("unbind.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("split.Tensor", torch::CppFunction::makeFallthrough()); \ + m.impl("split_with_sizes", torch::CppFunction::makeFallthrough()); \ + m.impl("swapaxes", torch::CppFunction::makeFallthrough()); \ + m.impl("swapdims", torch::CppFunction::makeFallthrough()); \ + m.impl("chunk", torch::CppFunction::makeFallthrough()); \ + m.impl("reshape", torch::CppFunction::makeFallthrough()); \ + m.impl("alias", torch::CppFunction::makeFallthrough()); \ + m.impl("hsplit.int", torch::CppFunction::makeFallthrough()); \ + m.impl("hsplit.array", torch::CppFunction::makeFallthrough()); \ + m.impl("dsplit.int", torch::CppFunction::makeFallthrough()); \ + m.impl("dsplit.array", torch::CppFunction::makeFallthrough()); \ + m.impl("vsplit.int", torch::CppFunction::makeFallthrough()); \ + m.impl("vsplit.array", torch::CppFunction::makeFallthrough()); \ + m.impl("conj", torch::CppFunction::makeFallthrough()); \ + m.impl("_conj", torch::CppFunction::makeFallthrough()); \ + m.impl("_unsafe_view", torch::CppFunction::makeFallthrough()); \ + m.impl("resize_", torch::CppFunction::makeFallthrough()); + +#define TENSOR_UTILITIES_AND_CONSTRUCTORS(m) \ + m.impl("empty_like", torch::CppFunction::makeFallthrough()); \ + m.impl("empty.memory_format", torch::CppFunction::makeFallthrough()); \ + m.impl("empty.out", torch::CppFunction::makeFallthrough()); \ + m.impl("empty_strided", torch::CppFunction::makeFallthrough()); \ + m.impl("full_like", torch::CppFunction::makeFallthrough()); \ + m.impl("stride.int", torch::CppFunction::makeFallthrough()); \ + m.impl("stride.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("size.int", torch::CppFunction::makeFallthrough()); \ + m.impl("size.Dimname", torch::CppFunction::makeFallthrough()); \ + m.impl("is_complex", torch::CppFunction::makeFallthrough()); \ + 
m.impl("is_floating_point", torch::CppFunction::makeFallthrough()); \ + m.impl("requires_grad_", torch::CppFunction::makeFallthrough()); +} + +#define TORCH_VIEW_FNS_NATIVE_FN_REGISTRATION(m) \ + m.impl("as_strided", torch::CppFunction::makeFallthrough()); \ + m.impl("view", torch::CppFunction::makeFallthrough()); diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h new file mode 100644 index 0000000000000000000000000000000000000000..1ab806645fbf144fe9df5f41497f4ca753e8573f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/Sorting.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +namespace at { +class TensorBase; +} + +namespace at::native { + +enum class QUANTILE_INTERPOLATION_MODE : uint8_t { + LINEAR, + LOWER, + HIGHER, + MIDPOINT, + NEAREST +}; + +using sort_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, bool, bool); +using topk_fn = void(*)(const TensorBase&, const TensorBase&, const TensorBase&, int64_t, int64_t, bool, bool); + +DECLARE_DISPATCH(sort_fn, sort_stub); +DECLARE_DISPATCH(topk_fn, topk_stub); + +void _fill_indices(const TensorBase &indices, int64_t dim); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SpectralOpsUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SpectralOpsUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..279e4ff59556793709e864ef79352275f1d148cf --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/SpectralOpsUtils.h @@ -0,0 +1,84 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +// 
Normalization types used in _fft_with_size +enum class fft_norm_mode { + none, // No normalization + by_root_n, // Divide by sqrt(signal_size) + by_n, // Divide by signal_size +}; + +// NOTE [ Fourier Transform Conjugate Symmetry ] +// +// Real-to-complex Fourier transform satisfies the conjugate symmetry. That is, +// assuming X is the transformed K-dimensionsal signal, we have +// +// X[i_1, ..., i_K] = X[j_i, ..., j_K]*, +// +// where j_k = (N_k - i_k) mod N_k, N_k being the signal size at dim k, +// * is the conjugate operator. +// +// Therefore, in such cases, FFT libraries return only roughly half of the +// values to avoid redundancy: +// +// X[:, :, ..., :floor(N / 2) + 1] +// +// This is also the assumption in cuFFT and MKL. In ATen SpectralOps, such +// halved signal will also be returned by default (flag onesided=True). +// The following infer_ft_real_to_complex_onesided_size function calculates the +// onesided size from the twosided size. +// +// Note that this loses some information about the size of signal at last +// dimension. E.g., both 11 and 10 maps to 6. Hence, the following +// infer_ft_complex_to_real_onesided_size function takes in optional parameter +// to infer the twosided size from given onesided size. 
+// +// cuFFT doc: http://docs.nvidia.com/cuda/cufft/index.html#multi-dimensional +// MKL doc: https://software.intel.com/en-us/mkl-developer-reference-c-dfti-complex-storage-dfti-real-storage-dfti-conjugate-even-storage#CONJUGATE_EVEN_STORAGE + +inline int64_t infer_ft_real_to_complex_onesided_size(int64_t real_size) { + return (real_size / 2) + 1; +} + +inline int64_t infer_ft_complex_to_real_onesided_size(int64_t complex_size, + int64_t expected_size=-1) { + int64_t base = (complex_size - 1) * 2; + if (expected_size < 0) { + return base + 1; + } else if (base == expected_size) { + return base; + } else if (base + 1 == expected_size) { + return base + 1; + } else { + std::ostringstream ss; + ss << "expected real signal size " << expected_size << " is incompatible " + << "with onesided complex frequency size " << complex_size; + AT_ERROR(ss.str()); + } +} + +using fft_fill_with_conjugate_symmetry_fn = + void (*)(ScalarType dtype, IntArrayRef mirror_dims, IntArrayRef half_sizes, + IntArrayRef in_strides, const void* in_data, + IntArrayRef out_strides, void* out_data); +DECLARE_DISPATCH(fft_fill_with_conjugate_symmetry_fn, fft_fill_with_conjugate_symmetry_stub); + +// In real-to-complex transform, cuFFT and MKL only fill half of the values +// due to conjugate symmetry. This function fills in the other half of the full +// fft by using the Hermitian symmetry in the signal. +// self should be the shape of the full signal and dims.back() should be the +// one-sided dimension. 
+// See NOTE [ Fourier Transform Conjugate Symmetry ] +TORCH_API void _fft_fill_with_conjugate_symmetry_(const Tensor& self, IntArrayRef dims); + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorTransformations.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorTransformations.h new file mode 100644 index 0000000000000000000000000000000000000000..f69c27edb976a4157dca1b0e0a38d748cdef9848 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorTransformations.h @@ -0,0 +1,30 @@ +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include + +namespace at::native { + +static inline Tensor roll_common(const Tensor& self, IntArrayRef shifts, IntArrayRef dims) { + TORCH_CHECK(!shifts.empty(), "`shifts` required"); + if (dims.empty() && shifts.size() == 1) { + auto flattened = self.contiguous().view(self.numel()); + return roll(flattened, shifts[0], 0).view(self.sizes()); + } + TORCH_CHECK( + shifts.size() == dims.size(), + "shifts and dimensions must align. 
shifts: ", shifts.size(), ", dims:", dims.size() + ); + AT_ASSERT(dims.size() > 1); + auto tail_shifts = shifts.slice(1); + auto tail_dims = dims.slice(1); + auto first_dim_rolled = roll(self, shifts[0], dims[0]); + return at::roll(first_dim_rolled, tail_shifts, tail_dims); +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..b1830fcd3911ec871ee9f1728f2cfcbf1c625031 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/GridSamplerKernel.h @@ -0,0 +1,34 @@ +#pragma once + +#include + +#include +#include + +namespace at { +class TensorBase; +} + +namespace at { namespace native { + +using forward_2d_fn = void (*) ( + const TensorBase &output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners); +using backward_2d_fn = void (*) ( + const TensorBase &grad_input, + const TensorBase &grad_grid, + const TensorBase &grad_output, + const TensorBase &input, + const TensorBase &grid, + int64_t interpolation_mode, + int64_t padding_mode, + bool align_corners, + std::array output_mask); +DECLARE_DISPATCH(forward_2d_fn, grid_sampler_2d_cpu_kernel); +DECLARE_DISPATCH(backward_2d_fn, grid_sampler_2d_backward_cpu_kernel); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..cc19ce995da4a7abb9870b193798ccd4d883c272 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/IndexKernelUtils.h @@ -0,0 +1,88 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +namespace { +static bool is_constant_index(int ntensor, const int64_t* strides) { + AT_ASSERT(ntensor >= 3); + for (const auto arg : c10::irange(2, ntensor)) { + if (strides[arg] != 0) { + return false; + } + } + return true; +} + + +struct Indexer { + Indexer(int64_t num_indexers, char** indexers, const int64_t* indexer_strides, + IntArrayRef original_sizes, IntArrayRef original_strides) + : num_indexers(num_indexers) + , indexers(indexers) + , indexer_strides(indexer_strides) + , original_strides(original_strides.data()) + , original_sizes(original_sizes.data()) { + AT_ASSERT(static_cast(original_strides.size()) == num_indexers); + AT_ASSERT(static_cast(original_sizes.size()) == num_indexers); + } + + int64_t num_indexers; + char** indexers; + const int64_t* indexer_strides; + const int64_t* original_strides; + const int64_t* original_sizes; + + int64_t get(int64_t idx) { + int64_t offset = 0; + for (const auto j : c10::irange(num_indexers)) { + int64_t value = *(int64_t*)&indexers[j][idx * indexer_strides[j]]; + int64_t size = original_sizes[j]; + TORCH_CHECK_INDEX(value >= -size && value < size, + "index ", value, " is out of bounds for dimension ", j, " with size ", size); + if (value < 0) { + value += size; + } + offset += value * original_strides[j]; + } + return offset; + } +}; +} // anonymous namespace + +template +void cpu_index_kernel(TensorIteratorBase& iter, IntArrayRef index_size, IntArrayRef index_stride, + const func_t& f, bool serial_execution=false) +{ + int ntensor = iter.ntensors(); + // When launch the index parallel version, set a relative small grain size less than the INTERNAL::GRAIN_SIZE + // to make the whole available thread numbers get more balanced work load and a better cache location. 
+ // The grain size here is chosen by the op benchmark to overcome the thread launch overhead + const int index_parallel_grain_size = 3000; + auto loop = [&](char** data, const int64_t* strides, int64_t n) { + auto indexer = Indexer(ntensor - 2, &data[2], &strides[2], index_size, index_stride); + char* dst = data[0]; + char* src = data[1]; + if (is_constant_index(ntensor, strides)) { + // specialization for when every element uses the same index + int64_t offset = indexer.get(0); + for (const auto i : c10::irange(n)) { + f(dst + strides[0] * i, src + strides[1] * i, offset); + } + } else { + for (const auto i : c10::irange(n)) { + int64_t offset = indexer.get(i); + f(dst + strides[0] * i, src + strides[1] * i, offset); + } + } + }; + if (serial_execution) { + iter.serial_for_each(loop, {0, iter.numel()}); + } else { + iter.for_each(loop, index_parallel_grain_size); + } +} +} // at +} // native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/LogAddExp.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/LogAddExp.h new file mode 100644 index 0000000000000000000000000000000000000000..c03cbebafaffbe86317490e1ab168ff960ad9005 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/LogAddExp.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include + +namespace at { namespace native { +inline namespace CPU_CAPABILITY { + +// custom min and max to be used in logcumsumexp for complex arguments +template +std::pair, c10::complex> _logcumsumexp_minmax(c10::complex x, c10::complex y) { + if (at::_isnan(y)) { // either real is nan or imag is nan + return std::make_pair(y, y); + } else if (at::_isnan(x)) { // either real is nan or imag is nan + return std::make_pair(x, x); + } else { + return (x.real() < y.real()) ? 
std::make_pair(x, y) : std::make_pair(y, x); + } +} + +template +scalar_t _log_add_exp_helper(scalar_t x, scalar_t y) { + // Reference : https://www.tensorflow.org/api_docs/python/tf/math/cumulative_logsumexp + scalar_t min = at::_isnan(y) ? y : std::min(x, y); // std::min returns first arg if one of the args is nan + scalar_t max = at::_isnan(y) ? y : std::max(x, y); // std::max returns first arg if one of the args is nan + if (min != max || std::isfinite(min)) { + // nan will be propagated here + return std::log1p(std::exp(min - max)) + max; + } else { + // special case to correctly handle infinite cases + return x; + } +} + +template +c10::complex _log_add_exp_helper(const c10::complex& x, const c10::complex& y) { + auto [min, max] = _logcumsumexp_minmax(x, y); + auto min_real = std::real(min); + auto max_real = std::real(max); + + if (at::_isnan(min)) { // either real is nan or imag is nan + // handling the "infectious" NaNs + return {std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}; + } else if (!std::isfinite(min_real) && (min_real == max_real)) { + if (min_real < 0) { + // handle the -inf case, the imaginary part here does not really matter as the exp(value) + // will be around 0.0 and the angle (i.e. the imaginary part) cannot be determined. 
+ // It does not matter if we're taking the exp of this value + return min; + } else { + // handle the +inf case, we don't need the special precision for log1p for small values + // and to avoid producing nan in case of real(max) == real(min) == +inf + return std::log(std::exp(min) + std::exp(max)); + } + } else { + return std::log1p(std::exp(min - max)) + max; + } +} + +} // end namespace +}} //end at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..c015e674a24c597aae9475995612a93271c5ce72 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/PixelShuffleKernel.h @@ -0,0 +1,14 @@ +#pragma once +#include + +namespace at { +class TensorBase; +} + +namespace at { namespace native { + +using pixel_shuffle_fn = void(*)(TensorBase&, const TensorBase&, int64_t); +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_shuffle_kernel); +DECLARE_DISPATCH(pixel_shuffle_fn, pixel_unshuffle_kernel); + +}} // at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..04dba4b9b61ced5823c5444df7b510b9d28b630b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SampledAddmmKernel.h @@ -0,0 +1,12 @@ +#pragma once + +#include +#include + +namespace at { namespace native { + +using sampled_addmm_sparse_csr_fn = void(*)(const Tensor&, const Tensor&, const Scalar&, const Scalar&, const Tensor&); + +DECLARE_DISPATCH(sampled_addmm_sparse_csr_fn, 
sampled_addmm_sparse_csr_stub); + +}} // at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h new file mode 100644 index 0000000000000000000000000000000000000000..53ee6a603b9f3b4f14b498e92bb94ad28380c45f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -0,0 +1,1376 @@ +/* +The Python Imaging Library (PIL) is + + Copyright © 1997-2011 by Secret Labs AB + Copyright © 1995-2011 by Fredrik Lundh + +Pillow is the friendly PIL fork. It is + + Copyright © 2010-2022 by Alex Clark and contributors + +Like PIL, Pillow is licensed under the open source HPND License +*/ + +// This code is heavily inspired from PILLOW-SIMD's implementation: +// https://github.com/uploadcare/pillow-simd/blob/simd/master/src/libImaging/Resample.c + +#pragma once +#ifdef CPU_CAPABILITY_AVX2 +// TODO: This file only supports AVX2. We could split the AVX kernels into +// smaller logical blocks in order to port them into the Vec.h logic. This would +// allow to support other vectorization architectures and perhaps also support +// the non-vectorized fallback (we'd need to make sure it's not slower than the +// current fallback). 
+ +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +namespace { + +static inline __m128i mm_cvtsi32_si128(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { + int32_t v; + if (i32_aligned) { + v = *(const int32_t*)ptr; + } else { + std::memcpy(&v, ptr, 4); + } + return _mm_cvtsi32_si128(v); +} + +static inline __m128i mm_cvtepu8_epi32(const uint8_t* C10_RESTRICT ptr, bool i32_aligned) { + return _mm_cvtepu8_epi32(mm_cvtsi32_si128(ptr, i32_aligned)); +} + +static inline void _write_endline_rgb_as_uint32( + uint8_t* C10_RESTRICT output, + uint32_t data +) { + // data is (R G B X), output is (X1 X2 X3 | R1 B1 G1 R2 ...) + // Here we explicitly set X as R1 + uint8_t* data_ptr = reinterpret_cast(&data); + data_ptr[3] = output[3]; + std::memcpy(output, data_ptr, 4); +} + +at::Tensor unpack_rgb(const at::Tensor& packed_tensor) { + // Convert a "packed" tensor (typically RGBRGBRGB if channels_last) into + // RGBARGBARGBA format where A is hard-coded to 0. Each pixel is encoded + // into as 32 bits. This generalizes to num_channels <= 4 and also works for + // non-channels_last tensors. + + const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr(); + auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); + auto num_channels = packed_tensor.size(0); + + constexpr int rgba_size = 4; + auto unpacked_tensor = at::empty({rgba_size, packed_tensor.size(1), packed_tensor.size(2)}, at::CPU(at::kByte)); + uint8_t* unpacked = (uint8_t*) unpacked_tensor.data_ptr(); + + auto stride_i = packed_tensor.stride(2); + auto stride_j = packed_tensor.stride(0); + + for (const auto i : c10::irange(num_pixels)) { + for (const auto j : c10::irange(rgba_size)) { + unpacked[rgba_size * i + j] = (j < num_channels) ? 
packed[stride_i * i + stride_j * j] : 0; + } + } + return unpacked_tensor; +} + +void pack_rgb( + const at::Tensor& unpacked_tensor, // IN + const at::Tensor& packed_tensor // OUT +) { + // Convert from unpacked channels last 3-channels or 4-channels tensor into original data layout. + + uint8_t* unpacked = (uint8_t*)unpacked_tensor.data_ptr(); + uint8_t* packed = (uint8_t*)packed_tensor.data_ptr(); + auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); + auto num_channels = packed_tensor.size(0); + + auto unpacked_increment = unpacked_tensor.size(0); + auto packed_increment = packed_tensor.stride(2); + auto packed_stride = packed_tensor.stride(0); + + TORCH_INTERNAL_ASSERT(unpacked_increment == 3 || unpacked_increment == 4); + + for (const auto i C10_UNUSED : c10::irange(num_pixels)) { + for (const auto j : c10::irange(num_channels)) { + packed[j * packed_stride] = unpacked[j]; + } + unpacked += unpacked_increment; + packed += packed_increment; + } +} + +void ImagingResampleHorizontalConvolution8u4x( + uint8_t* C10_RESTRICT lineOut0, + uint8_t* C10_RESTRICT lineOut1, + uint8_t* C10_RESTRICT lineOut2, + uint8_t* C10_RESTRICT lineOut3, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn0, + const uint8_t* C10_RESTRICT lineIn1, + const uint8_t* C10_RESTRICT lineIn2, + const uint8_t* C10_RESTRICT lineIn3, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line); + +void ImagingResampleHorizontalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line); + +void ImagingResampleVerticalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + const uint8_t* C10_RESTRICT lineIn, + int64_t 
xsize, + int64_t ids_min, + int64_t ids_size, + const int16_t* k, + unsigned int coefs_precision, + int64_t num_channels); + +template +void ImagingResampleHorizontal( + const at::Tensor & unpacked_output, + const at::Tensor & unpacked_input, + int ksize, + const std::vector& horiz_indices_weights, + unsigned int horiz_weights_precision) { + + // Interpolation horizontal pass: we compute x-axis (image width) interpolation outputs. + + // Input data is stored as + // input = [r[0], g[0], b[0], a[0], r[1], g[1], b[1], a[1], r[2], g[2], b[2], a[2], ...] + // Weights are float values computed for each output pixel and rescaled to uint16: + // weights[i] = [w[i, 0], w[i, 1], ..., w[i, K-1]] + // We want to compute the output as following: + // output = [oR[0], oG[0], oB[0], oA[0], oR[1], oG[1], oB[1], oA[1], ...] + // where + // oR[yoffset + i] = r[yoffset + xmin[i]] * w[i, 0] + ... + r[yoffset + xmin[i] + K-1] * w[i, K-1] + // oG[yoffset + i] = g[yoffset + xmin[i]] * w[i, 0] + ... + g[yoffset + xmin[i] + K-1] * w[i, K-1] + // oB[yoffset + i] = b[yoffset + xmin[i]] * w[i, 0] + ... 
+ b[yoffset + xmin[i] + K-1] * w[i, K-1] + // + + // TODO: we may want to merge that into the fallback code (currently called + // basic_loop_aa_horizontal) + // Although this may not be needed if / when we port all this code to use + // Vec.h since this would potentially give us another fall-back implem + + const int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr()); + + auto xout = unpacked_output.size(2); + auto yout = unpacked_output.size(1); + auto xin = unpacked_input.size(2); + TORCH_INTERNAL_ASSERT(num_channels == unpacked_input.size(0)); + + const int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr(); + const int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr(); + + uint8_t* unpacked_output_p = unpacked_output.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.data_ptr(); + + int64_t yy = 0; + auto xout_stride = xout * num_channels; + auto xin_stride = xin * num_channels; + for (; yy < yout - 3; yy += 4) { + ImagingResampleHorizontalConvolution8u4x( + unpacked_output_p + yy * xout_stride, + unpacked_output_p + (yy + 1) * xout_stride, + unpacked_output_p + (yy + 2) * xout_stride, + unpacked_output_p + (yy + 3) * xout_stride, + xout, + unpacked_input_p + yy * xin_stride, + unpacked_input_p + (yy + 1) * xin_stride, + unpacked_input_p + (yy + 2) * xin_stride, + unpacked_input_p + (yy + 3) * xin_stride, + xin, + idx_ptr_xmin, + idx_ptr_size, + kk, + ksize, + horiz_weights_precision, + num_channels, + yy + 3 == yout - 1); + } + for (; yy < yout; yy++) { + ImagingResampleHorizontalConvolution8u( + unpacked_output_p + yy * xout_stride, + xout, + unpacked_input_p + yy * xin_stride, + xin, + idx_ptr_xmin, + idx_ptr_size, + kk, + ksize, + horiz_weights_precision, + num_channels, + yy == yout - 1); + } +} + +void ImagingResampleVertical( + const at::Tensor & unpacked_output, + const at::Tensor & unpacked_input, + int ksize, + const std::vector& vert_indices_weights, + unsigned int vert_weights_precision) { + + // Interpolation vertical 
pass: we compute y-axis interpolation outputs. + // Input data is stored as + // input = [r[0], g[0], b[0], a[0], r[1], g[1], b[1], a[1], r[2], g[2], b[2], a[2], ...] + // Weights are float values computed for each output pixel and rescaled to uint16: + // weights[i] = [w[i, 0], w[i, 1], ..., w[i, K-1]] + // We want to compute the output as following: + // output = [oR[0], oG[0], oB[0], oA[0], oR[1], oG[1], oB[1], oA[1], ...] + // where + // oR[xoffset + i] = r[xoffset + ymin[i]] * w[i, 0] + ... + r[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + // oG[xoffset + i] = g[xoffset + ymin[i]] * w[i, 0] + ... + g[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + // oB[xoffset + i] = b[xoffset + ymin[i]] * w[i, 0] + ... + b[xoffset + ymin[i] + (K-1) * xsize] * w[i, K-1] + + // TODO: we may want to merge that into the fallback code (currently called + // basic_loop_aa_vertical) + // Although this may not be needed if / when we port all this code to use + // Vec.h since this would potentially give us another fall-back implem + const int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr()); + + const int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr(); + const int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr(); + + uint8_t* unpacked_output_p = unpacked_output.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.data_ptr(); + + auto xout = unpacked_output.size(2); + auto yout = unpacked_output.size(1); + const auto num_channels = unpacked_input.size(0); + TORCH_INTERNAL_ASSERT(num_channels == unpacked_output.size(0)); + + auto xout_stride = xout * num_channels; + for (const auto yy : c10::irange(yout)) { + const auto* k = &kk[yy * ksize]; + auto ids_min = idx_ptr_xmin[yy]; + auto ids_size = idx_ptr_size[yy]; + ImagingResampleVerticalConvolution8u( + unpacked_output_p + yy * xout_stride, + unpacked_input_p, + xout, + ids_min, + ids_size, + k, + vert_weights_precision, + num_channels); + } +} + +// This is the only public entry point in this file. 
It supports bilinear or bicubic +// mode for uint8 dtype when C <= 4, with or without antialias. The +// implem is based on PIL-SIMD. +// Its equivalent implementation (fallback) for when AVX isn't supported or when +// C > 4 is separable_upsample_generic_Nd_kernel_impl() There are a bunch of +// future improvement that can be done: look for the TODOs in this file. +// For details on how the weights are computed and how the multiplications are +// run on int (instead of float weights), see +// [ Weights computation for uint8_t and multiplication trick ] +// For details on how the AVX kernels are implemented, see +// https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 +// See also [ Support for antialias=False as a subcase of antialias=True ] to +// learn more about how the antialias=False case is computed. The same holds +// here: all these kernels are general enough to handle an arbitrary number of +// weights, but when aa=False they could be optimized further. +template +void upsample_avx_bilinear_bicubic_uint8( + const at::Tensor& input_, + const at::Tensor& output, + bool align_corners, + const scale_type& scales, + bool antialias) { + auto batch_size = input_.size(0); + auto num_channels = input_.size(1); + auto xin = input_.size(3); + auto yin = input_.size(2); + auto xout = output.size(3); + auto yout = output.size(2); + + if (xin == xout && yin == yout) { + output.copy_(input_); + return; + } + + at::Tensor input = input_; + if (!(input.is_contiguous() || input.is_contiguous(at::MemoryFormat::ChannelsLast))) { + // If input is not contiguous with memory format channels first or channels last, + // we explicitly convert the input to contiguous channels last memory format. 
+ // This simplifies the rest of the code and let us assume that the format is only contiguous channels first or channels last, + // Most tensors going through this `if` block won't need to go through unpacking, but those having C < 3 may + // have to (this means 2 copies are made). We could avoid the extra copy by handling non-contiguous input + // directly within unpack_rgb() and pack_rgb(), but initial attempts showed that this is fairly complex. + input = input.contiguous(at::MemoryFormat::ChannelsLast); + } + + auto need_horizontal = xout != xin; + auto need_vertical = yout != yin; + + int ksize_horiz, ksize_vert; + std::vector horiz_indices_weights, vert_indices_weights; + unsigned int horiz_weights_precision, vert_weights_precision; + + bool skip_unpacking = (num_channels == 3 || num_channels == 4) && input.is_contiguous(at::MemoryFormat::ChannelsLast); + bool skip_packing = (num_channels == 3 || num_channels == 4) && output.is_contiguous(at::MemoryFormat::ChannelsLast); + + if (need_horizontal) { + int interp_dim = 3; + auto stride = (skip_unpacking) ? num_channels : 4; + std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) = + F::compute_index_ranges_int16_weights( + /*input_size=*/xin, + /*output_size=*/xout, + /*stride=*/stride, + /*ndims=*/4, + /*reshape_dim=*/interp_dim, + /*align_corners=*/align_corners, + /*opt_scale=*/scales[interp_dim - 2], + /*antialias=*/antialias, + /*align_i32=*/true); + } + + if (need_vertical) { + int interp_dim = 2; + auto stride = (skip_unpacking) ? 
num_channels * xout : 4 * xout; + std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) = + F::compute_index_ranges_int16_weights( + /*input_size=*/yin, + /*output_size=*/yout, + /*stride=*/stride, + /*ndims=*/4, + /*reshape_dim=*/interp_dim, + /*align_corners=*/align_corners, + /*opt_scale=*/scales[interp_dim - 2], + /*antialias=*/antialias, + /*align_i32=*/true); + } + + at::Tensor buffer_horiz, buffer_vert; + // Minor optimization: we can avoid allocating an extra buffer if we're performing + // horizontal-only or vertical-only interpolation, and if the tensor doesn't + // need repacking + if (need_horizontal && (need_vertical || !skip_packing)) { + auto c = (skip_unpacking) ? num_channels : 4; + buffer_horiz = at::empty({c, yin, xout}, input.options()); + } + if (need_vertical && !skip_packing) { + auto c = (skip_unpacking) ? num_channels : 4; + buffer_vert = at::empty({c, yout, xout}, input.options()); + } + + for (const auto i : c10::irange(batch_size)) { + + at::Tensor unpacked_input = (skip_unpacking) ? input[i] : unpack_rgb(input[i]); + at::Tensor unpacked_output; + + if (need_horizontal) { + at::Tensor unpacked_output_temp = (need_vertical || !skip_packing) ? buffer_horiz : output[i]; + + if (skip_unpacking && num_channels == 3) { + ImagingResampleHorizontal<3>( + unpacked_output_temp, + unpacked_input, + ksize_horiz, + horiz_indices_weights, + horiz_weights_precision); + } else { + ImagingResampleHorizontal<4>( + unpacked_output_temp, + unpacked_input, + ksize_horiz, + horiz_indices_weights, + horiz_weights_precision); + } + unpacked_output = unpacked_input = unpacked_output_temp; + } + if (need_vertical) { + unpacked_output = (skip_packing) ? 
output[i] : buffer_vert; + + ImagingResampleVertical( + unpacked_output, + unpacked_input, + ksize_vert, + vert_indices_weights, + vert_weights_precision + ); + } + + TORCH_INTERNAL_ASSERT(unpacked_output.defined()); + + if (!skip_packing) { + pack_rgb(unpacked_output, output[i]); + } + } +} + +void ImagingResampleHorizontalConvolution8u4x( + uint8_t* C10_RESTRICT lineOut0, + uint8_t* C10_RESTRICT lineOut1, + uint8_t* C10_RESTRICT lineOut2, + uint8_t* C10_RESTRICT lineOut3, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn0, + const uint8_t* C10_RESTRICT lineIn1, + const uint8_t* C10_RESTRICT lineIn2, + const uint8_t* C10_RESTRICT lineIn3, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line) { + + // Interpolation horizontal pass processing together 4 vertical lines. + // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA + // we can encode 4 values as a single uint32 value. + // - We split the size of weight vector for a given output index as a sum: + // ids_size = num_blocks_4 * 4 + num_blocks_2 * 2 + num_blocks_1. + // - We load and process 4 weights values in a loop ("block 4") then we process 2 weights values + // in another loop ("block 2") and finally we process 1 weights value in the final loop ("block 1"). + + // Define shuffling masks (low/high) for num_channels 4 and 3 + // Mask low casts lower half of each lane to epi16 and reorder RGBARGBA -> RRGGBBAA: + // [r1 g1 b1 a1 r2 g2 b2 a2 ... | R1 G1 B1 A1 R2 G2 B2 A2 ... ] -> + // [r1 0 r2 0 g1 0 g2 0 b1 0 b2 0 a1 0 a2 0 | R1 0 R2 0 G1 0 G2 0 B1 0 B2 0 A1 0 A2 0] + // Mask high casts upper half of each lane to epi16 and reorder RGBARGBA -> RRGGBBAA:: + // [ ... r3 g3 b3 a3 r4 g4 b4 a4 | ... 
R3 G3 B3 A3 R4 G4 B4 A4 ] -> + // [r3 0 r4 0 g3 0 g4 0 b3 0 b4 0 a3 0 a4 0 | R3 0 R4 0 G3 0 G4 0 B3 0 B4 0 A3 0 A4 0] + + const auto mask_low_c4 = _mm256_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + const auto mask_high_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8); + const auto mask_low_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_high_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6); + + const auto mask_low = (num_channels == 3) ? mask_low_c3 : mask_low_c4; + const auto mask_high = (num_channels == 3) ? mask_high_c3 : mask_high_c4; + + const auto stride = num_channels * sizeof(uint8_t); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + // out_xsize = output width, out_x = output x index + // ids_min is the input offset index corresponding to out_x + // ids_size is the interpolation size for out_x + + // Let's precompute ids_size limits for block 4 and block 2. + // + // In block 4 (4 means we process 4 weight values together), we read input data + // with _mm_loadu_si128, i.e. 16 bytes, per one line: + // lineIn0 + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 16.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft + // RGBA: b4_delta = b4_delta_soft = 3 + // RGB : b4_delta = 5 + // RGB : b4_delta_soft = 4 + const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 
5 : 4); + + // In block 2 (2 means we process 2 weights values together), we read input data + // with _mm_loadl_epi64, i.e. 8 bytes, per one line: + // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 8.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft + // RGBA: b2_delta = b2_delta_soft = 1 + // RGB : b2_delta = 2 + // RGB : b2_delta_soft = 1 + const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 2 : 1); + + const auto max_out_x_strided = out_xsize * stride; + const auto max_in_x_strided = in_xsize * stride; + + const auto zero = _mm256_setzero_si256(); + const auto initial = _mm256_set1_epi32(1 << (coefs_precision - 1)); + + for (const auto out_x : c10::irange(out_xsize)) { + const auto ids_min = idx_ptr_xmin[out_x]; + const auto ids_size = idx_ptr_size[out_x]; + const auto * k = &kk[out_x * kmax]; + int64_t i = 0; + + auto sss0 = initial; + auto sss1 = initial; + + const auto * lineIn0_min = lineIn0 + ids_min; + const auto * lineIn1_min = lineIn1 + ids_min; + const auto * lineIn2_min = lineIn2 + ids_min; + const auto * lineIn3_min = lineIn3 + ids_min; + + // block 4 + for (; i < ids_size - b4_delta; i += 4) { + // Load 4 values from weight vector + // mmk0 = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + // mmk1 = [wl_2 wh_2 wl_3 wh_3 wl_2 wh_2 wl_3 wh_3 ...] 
+ const auto mmk0 = _mm256_set1_epi32(*(int32_t*)&k[i]); + const auto mmk1 = _mm256_set1_epi32(*(int32_t*)&k[i + 2]); + + // RGBA: Load 8 pixels (4 per line) from input lines 0 and 1: + // source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // R0 G0 B0 A0 R1 G1 B1 A1 R2 G2 B2 A2 R3 G3 B3 A3 + // ] + // RGB: Load 10 pixels (5 per line) + // source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // R0 G0 B0 R1 G1 B1 R2 G2 B2 R3 G3 B3 R4 G4 B4 R5 + // ] + auto source = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn0_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn1_min + stride * i)), 1); + + // Apply mask_low: + // RGBA: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 A0 0 A1 0] + // RGB: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 0 0 0 0] + auto pix1 = _mm256_shuffle_epi8(source, mask_low); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk0)); + + // Apply mask_high: + // RGBA: + // [r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 a2 0 a3 0 | R2 0 R3 0 G2 0 G3 0 B2 0 B3 0 A2 0 A3 0] + // RGB: + // [r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 0 0 0 0 | R2 0 R3 0 G2 0 G3 0 B2 0 B3 0 0 0 0 0] + auto pix2 = _mm256_shuffle_epi8(source, mask_high); + // Compute output value as C += w2 * C2 + w3 * C3 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix2, mmk1)); + + // Same as above to next lines 2 and 3: + auto source2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn2_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn3_min + stride * i)), 1); + auto pix3 = _mm256_shuffle_epi8(source2, mask_low); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix3, mmk0)); + auto pix4 = _mm256_shuffle_epi8(source2, mask_high); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix4, mmk1)); + } + + // 
block 2 + for (; i < ids_size - b2_delta; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + const auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]); + + // Load 4 pixels (2 per line) from input lines 0 and 1: + // RGBA: source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // R0 G0 B0 A0 R1 G1 B1 A1 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 g1 b1 r2 0 0 0 0 0 0 0 0 + // R0 G0 B0 R1 G1 B1 R2 0 0 0 0 0 0 0 0 + // ] + auto source1 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *) (lineIn0_min + stride * i))), + _mm_loadl_epi64((__m128i *) (lineIn1_min + stride * i)), 1); + // Apply mask_low: + // RGBA: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 A0 0 A1 0] + // RGB: + // [r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 | R0 0 R1 0 G0 0 G1 0 B0 0 B1 0 0 0 0 0] + auto pix1 = _mm256_shuffle_epi8(source1, mask_low); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // Same as above for lines 2 and 3: + auto source2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadl_epi64((__m128i *) (lineIn2_min + stride * i))), + _mm_loadl_epi64((__m128i *) (lineIn3_min + stride * i)), 1); + auto pix2 = _mm256_shuffle_epi8(source2, mask_low); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + // block 1 + const auto i32_aligned = num_channels == 4; + for (; i < ids_size - 1; i++) { + // Load 1 value from weight vector + // mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...] 
+ const auto mmk = _mm256_set1_epi32(k[i]); + + // Load 2 pixels (one per line) from input lines 0 and 1: + // RGBA: pix1 = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 a0 0 0 0 + // R0 0 0 0 G0 0 0 0 B0 0 0 0 A0 0 0 0 + // ] + // RGB: pix1 = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0 + // R0 0 0 0 G0 0 0 0 B0 0 0 0 R1 0 0 0 + // ] + auto pix1 = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1); + // Compute output value as C += w0 * C0 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // Same as above for lines 2 and 3 + auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned), 1); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + if (i == ids_size - 1) { + // last element + auto mmk = _mm256_set1_epi32(k[i]); + // For num_channels == 3 (3 bytes = one pixel) we tolerate to read 4 bytes + // lines 0, 1 and 2 wont go out of allocated memory bounds + auto pix = _mm256_inserti128_si256(_mm256_castsi128_si256( + mm_cvtepu8_epi32(lineIn0_min + stride * i, i32_aligned)), + mm_cvtepu8_epi32(lineIn1_min + stride * i, i32_aligned), 1); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix, mmk)); + + auto p0 = mm_cvtepu8_epi32(lineIn2_min + stride * i, i32_aligned); + __m128i p1; + if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) { + uint8_t input[4]; + std::memcpy(input, lineIn3_min + stride * i, 3); + p1 = mm_cvtepu8_epi32(input, true); + } else { + p1 = mm_cvtepu8_epi32(lineIn3_min + stride * i, i32_aligned); + } + auto pix2 = _mm256_inserti128_si256(_mm256_castsi128_si256(p0), p1, 1); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + } + + // Convert fixed point values back to integers (truncating) + sss0 
= _mm256_srai_epi32(sss0, coefs_precision); + sss1 = _mm256_srai_epi32(sss1, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d 0 0 0 0 0 0 0 0) + sss0 = _mm256_packs_epi32(sss0, zero); + sss1 = _mm256_packs_epi32(sss1, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d 0 0 0 0) + sss0 = _mm256_packus_epi16(sss0, zero); + sss1 = _mm256_packus_epi16(sss1, zero); + + // Write the output into single uint32 + // (a b c d) -> x_uint32 + auto o0 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sss0)); + auto o1 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss0, 1)); + auto o2 = _mm_cvtsi128_si32(_mm256_castsi256_si128(sss1)); + auto o3 = _mm_cvtsi128_si32(_mm256_extracti128_si256(sss1, 1)); + + const auto out_x_strided = stride * out_x; + + if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) { + // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write + // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). + // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct + // value which was previously computed by another line. In other words, it means that we can not overwrite + // it by simply writing 4 bytes from the register to the output. We'll do the following: + // v----------| + // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] + // First, we write R1 value to the 4th byte of (R G B | X) -> (R G B | R1) + // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1) + // Output = [... R G B | R1 G1 B1 R2 ...] 
+ + _write_endline_rgb_as_uint32(lineOut0 + out_x_strided, o0); + _write_endline_rgb_as_uint32(lineOut1 + out_x_strided, o1); + _write_endline_rgb_as_uint32(lineOut2 + out_x_strided, o2); + + if (C10_UNLIKELY(is_last_line)) { + // When we handle the last line, we can not access the next 4 bytes + // as they are out of memory bounds. + std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, num_channels); + } else { + _write_endline_rgb_as_uint32(lineOut3 + out_x_strided, o3); + } + } else if (num_channels == 3) { + // Memcpy 4-bytes is faster than 3-bytes and here + // we simply write 4 bytes (... R G B X 0 0 0 0 0 ...) where X is a garbage value + // that we will overwrite on the next iteration: (... R G B R G B X 0 0 ...) + std::memcpy(lineOut0 + out_x_strided, (uint8_t *) &o0, 4); + std::memcpy(lineOut1 + out_x_strided, (uint8_t *) &o1, 4); + std::memcpy(lineOut2 + out_x_strided, (uint8_t *) &o2, 4); + std::memcpy(lineOut3 + out_x_strided, (uint8_t *) &o3, 4); + } else { + // num_channels = 4 -> lineOutX + out_x_strided should be uint32 aligned + *(uint32_t *)(lineOut0 + out_x_strided) = o0; + *(uint32_t *)(lineOut1 + out_x_strided) = o1; + *(uint32_t *)(lineOut2 + out_x_strided) = o2; + *(uint32_t *)(lineOut3 + out_x_strided) = o3; + } + } +} + +void ImagingResampleHorizontalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + int64_t out_xsize, + const uint8_t* C10_RESTRICT lineIn, + int64_t in_xsize, + const int64_t* idx_ptr_xmin, + const int64_t* idx_ptr_size, + const int16_t* kk, + int kmax, + unsigned int coefs_precision, + int64_t num_channels, + bool is_last_line) { + + // Interpolation horizontal pass processing only one vertical line. + // - Input data format is RGBA or RGB with R,G,B,A being uint8. In case of RGBA + // we can encode 4 values as a single uint32 value. 
+ // - We split the size of weight vector for a given output index as a sum: + // ids_size = num_blocks_8 * 8 + num_blocks_4 * 4 + num_blocks_2 * 2 + num_blocks_1 + // - We load and process 8 weights values in a loop ("block 8") then 4 weights and 2 weights values in + // in another loops ("block 4" and "block 2") and finally we process 1 weight value in the final loop ("block 1"). + + // Define various shuffling masks + const auto kmask_low = _mm256_set_epi8( + 11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8, 11, 10, 9, 8, + 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + const auto kmask_high = _mm256_set_epi8( + 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, + 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4); + const auto kmask_hl = _mm256_set_epi8( + 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, 7, 6, 5, 4, + 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + + const auto mask_low_c4 = _mm256_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + const auto mask_high_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8); + const auto mask_low_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_high_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6); + const auto mask_hl_c3 = _mm256_set_epi8( + -1, -1, -1, -1, -1, 11, -1, 8, -1, 10, -1, 7, -1, 9, -1, 6, + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto mask_hl_c4 = _mm256_set_epi8( + -1, 15, -1, 11, -1, 14, -1, 10, -1, 13, -1, 9, -1, 12, -1, 8, + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + + const auto mask_low128_c3 = _mm_set_epi8( + -1, -1, -1, -1, -1, 5, -1, 2, -1, 4, -1, 1, -1, 3, -1, 0); + const auto 
mask_low128_c4 = _mm_set_epi8( + -1, 7, -1, 3, -1, 6, -1, 2, -1, 5, -1, 1, -1, 4, -1, 0); + + const auto mask_low = (num_channels == 3) ? mask_low_c3 : mask_low_c4; + const auto mask_high = (num_channels == 3) ? mask_high_c3 : mask_high_c4; + const auto mask_hl = (num_channels == 3) ? mask_hl_c3 : mask_hl_c4; + const auto mask_low128 = (num_channels == 3) ? mask_low128_c3 : mask_low128_c4; + + // out_xsize = output width, out_x = output x index + // ids_min is the input offset index corresponding to out_x + // ids_size is the interpolation size for out_x + + const auto stride = num_channels * sizeof(uint8_t); + const auto zero = _mm_setzero_si128(); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + // Let's precompute ids_size limits for block 8, block 4 and block 2 + // + // In block 8 (8 means we process 8 weight values together), we read at + // most 32 bytes input data (16 + 16 bytes for RGBA and 12 + 16 bytes for RGB) + // lineIn + stride * (i + ids_min) + 32 <= lineIn + stride * (ids_size + ids_min) + // --> i <= ids_size - 32.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(32.0 / stride)) = ids_size - b8_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(32.0 / stride) = ids_size - b8_delta_soft + // RGBA: b8_delta = b8_delta_soft = 7 + // RGB : b8_delta = 10 + // RGB : b8_delta_soft = 9 + const auto b8_delta = (stride == 4) ? 7 : ((is_last_line) ? 10 : 9); + + // In block 4 (4 means we process 4 weight values together), we read + // 16 bytes of input data. 
+ // lineIn + stride * (i + ids_min) + 16 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 16.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(16.0 / stride)) = ids_size - b4_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(16.0 / stride) = ids_size - b4_delta_soft + // RGBA: b4_delta = b4_delta_soft = 3 + // RGB : b4_delta = 5 + // RGB : b4_delta_soft = 4 + const auto b4_delta = (stride == 4) ? 3 : ((is_last_line) ? 5 : 4); + + // In block 2 (2 means we process 2 weight values together), we read + // 8 bytes of input data. + // lineIn0 + stride * (i + ids_min) + 8 <= lineIn0 + stride * (ids_size + ids_min) + // --> i <= ids_size - 8.0 / stride + // Strict boundary: + // --> i < ids_size + 1 - int(ceil(8.0 / stride)) = ids_size - b2_delta + // Soft boundary for reading inside the buffer except its boundaries: + // --> i < ids_size + 1 - int(8.0 / stride) = ids_size - b2_delta_soft + // RGBA: b2_delta = b2_delta_soft = 1 + // RGB : b2_delta = 2 + // RGB : b2_delta_soft = 1 + const auto b2_delta = (stride == 4) ? 1 : ((is_last_line) ? 
2 : 1); + + const auto max_out_x_strided = out_xsize * stride; + const auto max_in_x_strided = in_xsize * stride; + + for (const auto out_x : c10::irange(out_xsize)) { + __m128i sss; + const auto ids_min = idx_ptr_xmin[out_x]; + const auto ids_size = idx_ptr_size[out_x]; + const auto * k = &kk[out_x * kmax]; + int64_t i = 0; + + const auto * lineIn_min = lineIn + ids_min; + + if (ids_size < 8) { + sss = _mm_set1_epi32(1 << (coefs_precision - 1)); + } else { + // Lower part will be added to higher, use only half of the error + auto sss256 = _mm256_set1_epi32(1 << (coefs_precision - 2)); + + // block 8 + for (; i < ids_size - b8_delta; i += 8) { + // Load 8 values from weight vector + auto tmp = _mm_loadu_si128((__m128i*)&k[i]); + // ksource = [ + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 wl_4 wh_4 wl_5 wh_5 wl_6 wh_6 wl_7 wh_7 + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 wl_4 wh_4 wl_5 wh_5 wl_6 wh_6 wl_7 wh_7 + // ] + auto ksource = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // RGBA: Load 8 pixels from input: + // source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r4 g4 b4 a4 r5 g5 b5 a5 r6 g6 b6 a6 r7 g7 b7 a7 + // ] + // RGB: Load 10 pixels from input (however we can process only 8 pixels): + // source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r4 g4 b4 r5 g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 + // ] + auto source = _mm256_inserti128_si256(_mm256_castsi128_si256( + _mm_loadu_si128((__m128i *) (lineIn_min + stride * i))), + _mm_loadu_si128((__m128i *) (lineIn_min + stride * (i + 4))), 1); + + // Extract lower part of each lane, cast to epi16 and reoder RGBARGBA -> RRGGBBAA + // RGBA: pix1 = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 + // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 a4 0 a5 0 + // ] + // RGB: pix1 = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 + // r4 0 r5 0 g4 0 g5 0 b4 0 b5 0 0 0 0 0 + // ] + auto pix1 = _mm256_shuffle_epi8(source, mask_low); + // mmk1 = [ + // wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 
wh_1 ... ... + // wl_4 wh_4 wl_5 wh_5 wl_4 wh_4 wl_5 wh_5 ... ... + // ] + auto mmk1 = _mm256_shuffle_epi8(ksource, kmask_low); + // Compute output value as + // C += w0 * C0 + w1 * C1 + // C += w4 * C4 + w5 * C5 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix1, mmk1)); + + // Same as above for higher part of each lane + auto pix2 = _mm256_shuffle_epi8(source, mask_high); + auto mmk2 = _mm256_shuffle_epi8(ksource, kmask_high); + // Compute output value as + // C += w2 * C2 + w3 * C3 + // C += w6 * C6 + w7 * C7 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix2, mmk2)); + } + + // block 4 + for (; i < ids_size - b4_delta; i += 4) { + // Load 4 values from weight vector + auto tmp = _mm_loadl_epi64((__m128i *) &k[i]); + // ksource = [ + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 0 0 0 0 0 0 0 0 + // wl_0 wh_0 wl_1 wh_1 wl_2 wh_2 wl_3 wh_3 0 0 0 0 0 0 0 0 + // ] + auto ksource = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // Load pixels from input line + tmp = _mm_loadu_si128((__m128i *) (lineIn_min + stride * i)); + // RGBA: source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // ] + // RGB: source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // ] + auto source = _mm256_insertf128_si256(_mm256_castsi128_si256(tmp), tmp, 1); + + // Cast source to epi16 and reorder RGBARGBA -> RRGGBBAA + // RGBA: pix = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 a0 0 a1 0 + // r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 a2 0 a3 0 + // ] + // RGB: pix = [ + // r0 0 r1 0 g0 0 g1 0 b0 0 b1 0 0 0 0 0 + // r2 0 r3 0 g2 0 g3 0 b2 0 b3 0 0 0 0 0 + // ] + auto pix = _mm256_shuffle_epi8(source, mask_hl); + // mmk = [ + // wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... ... + // wl_2 wh_2 wl_3 wh_3 wl_2 wh_2 wl_3 wh_3 ... ... 
+ // ] + auto mmk = _mm256_shuffle_epi8(ksource, kmask_hl); + // Compute output value as + // C += w0 * C0 + w1 * C1 + // C += w2 * C2 + w3 * C3 for each channel in 32-bit precision + sss256 = _mm256_add_epi32(sss256, _mm256_madd_epi16(pix, mmk)); + } + + // Sum results between the lanes + sss = _mm_add_epi32( + _mm256_extracti128_si256(sss256, 0), + _mm256_extracti128_si256(sss256, 1)); + } + + // block 2 + for (; i < ids_size - b2_delta; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ...] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + // Load pixels from input line + // RGBA: source = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // ] + // RGB: source = [ + // r0 g0 b0 r1 g1 b1 r2 g2 0 0 0 0 0 0 0 0 + // ] + auto source = _mm_loadl_epi64((__m128i *) (lineIn_min + stride * i)); + // Cast source to epi16 and reorder RGBARGBA -> RRGGBBAA + auto pix = _mm_shuffle_epi8(source, mask_low128); + // Compute output value as C += w0 * C0 + w1 * C1 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // block 1 + const auto i32_aligned = num_channels == 4; + for (; i < ids_size - 1; i++) { + // Load 1 value from weight vector + // mmk = [wl_0 wh_0 0 0 wl_0 wh_0 0 0 ...] 
+ auto mmk = _mm_set1_epi32(k[i]); + // Load one pixel from input line + // RGBA: pix = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 a0 0 0 0 + // ] + // RGB: pix = [ + // r0 0 0 0 g0 0 0 0 b0 0 0 0 r1 0 0 0 + // ] + auto pix = mm_cvtepu8_epi32(lineIn_min + stride * i, i32_aligned); + // Compute output value as C += w0 * C0 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + if (i == ids_size - 1) { + // last element + auto mmk = _mm_set1_epi32(k[i]); + __m128i pix; + auto p = lineIn_min + stride * i; + if (num_channels == 3 && C10_UNLIKELY(is_last_line && ids_min + stride * i + 4 >= max_in_x_strided)) { + uint8_t input[4]; + std::memcpy(input, p, 3); + pix = mm_cvtepu8_epi32(input, true); + } else { + pix = mm_cvtepu8_epi32(p, i32_aligned); + } + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Convert fixed point values back to integers (truncating) + sss = _mm_srai_epi32(sss, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d 0 0 0 0 0 0 0 0) + sss = _mm_packs_epi32(sss, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d 0 0 0 0) + sss = _mm_packus_epi16(sss, zero); + // Write the output into single uint32 + // (a b c d) -> x_uint32 + auto o = _mm_cvtsi128_si32(sss); + const auto out_x_strided = stride * out_x; + if (num_channels == 3 && C10_UNLIKELY(out_x_strided + 4 >= max_out_x_strided)) { + if (C10_UNLIKELY(is_last_line)) { + // When we handle the last line, we can not access the next 4 bytes + // as they are out of memory bounds. + std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 3); + } else { + // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write + // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). 
+ // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct + // value which was previously computed by another line. In other words, it means that we can not overwrite + // it by simply writing 4 bytes from the register to the output. We'll do the following: + // v----------| + // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] + // First, we write R1 value to the 4th byte of (R G B | X) -> (R G B | R1) + // Second, we write 4 bytes from the register to the output: (X1 X2 X3 | R1) -> (R G B | R1) + // Output = [... R G B | R1 G1 B1 R2 ...] + _write_endline_rgb_as_uint32(lineOut + out_x_strided, o); + } + } else if (num_channels == 3) { + // Memcpy 4-bytes is faster than 3-bytes and here + // we simply write 4 bytes (... R G B X 0 0 0 0 0 ...) where X is a garbage value + // that we will overwrite on the next iteration: (... R G B R G B X 0 0 ...) + std::memcpy(lineOut + out_x_strided, (uint8_t *) &o, 4); + } else { + // num_channels = 4 -> lineOut + out_x_strided should be uint32 aligned + *(uint32_t *)(lineOut + out_x_strided) = o; + } + } +} + +void ImagingResampleVerticalConvolution8u( + uint8_t* C10_RESTRICT lineOut, + const uint8_t* C10_RESTRICT lineIn, + int64_t xsize, + int64_t ids_min, + int64_t ids_size, + const int16_t* k, + unsigned int coefs_precision, + int64_t num_channels) { + + // Interpolation vertical pass processing one line. + // - We process x-axis data with blocks of 8, 2 and 1 + // - We split the size of weight vector for a given output index as a sum: K = n * 2 + m. 
+ + // xsize = output width, also equals to input width + // ids_size = interpolation size + // ids_min = input y start index + const auto stride = num_channels * sizeof(uint8_t); + + TORCH_INTERNAL_ASSERT(stride == 3 || stride == 4); + + const int64_t data_size = xsize * stride; + const int64_t data_stride = stride; + constexpr auto vec_size = 256 / 8; + + const auto initial = _mm_set1_epi32(1 << (coefs_precision - 1)); + const auto initial_256 = _mm256_set1_epi32(1 << (coefs_precision - 1)); + const auto zero = _mm_setzero_si128(); + const auto zero_256 = _mm256_setzero_si256(); + + int64_t j = 0; + // block 8 + const auto b8_usable_vec_stride = (vec_size / data_stride) * data_stride; + for (; j < data_size - vec_size; j += b8_usable_vec_stride) { + auto sss0 = initial_256; + auto sss1 = initial_256; + auto sss2 = initial_256; + auto sss3 = initial_256; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + auto mmk = _mm256_set1_epi32(*(int32_t*)&k[i]); + + // RGBA: Load 8 pixels per line + // source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 + // r4 g4 b4 a4 r5 g5 b5 a5 r6 g6 b6 a6 r7 g7 b7 a7 + // ] + // RGB: Load 10 pixels per line (however we can process only 8 pixels): + // source1 = [ + // r0 g0 b0 r1 g1 b1 r2 g2 b2 r3 g3 b3 r4 g4 b4 r5 + // r4 g4 b4 r5 g5 b5 r6 g6 b6 r7 g7 b7 r8 g8 b8 r9 + // ] + auto source1 = + _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * i)); + auto source2 = + _mm256_loadu_si256((__m256i*)(lineIn_min + data_size * (i + 1))); + + // Interleave source1 and source2 from the low half of each 128-bit lane + // and cast the result to epi16 + // RGBA: pix1 = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 a1 0 A1 0 + // ] + // RGB: pix1 = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 0 0 0 0 + // ] + auto source_lo = _mm256_unpacklo_epi8(source1, source2); + 
auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256); + // Compute output value as + // C += w0 * c0 + w1 * C0 + // C += w0 * c1 + w1 * C1 for each channel in 32-bit precision + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + + // RGBA: pix2 = [ + // r2 0 R2 0 g2 0 G2 0 b2 0 B2 0 a2 0 A2 0 + // r3 0 R3 0 g3 0 G3 0 b3 0 B3 0 a3 0 A3 0 + // ] + // RGB: pix2 = [ + // r2 0 R2 0 g2 0 G2 0 b2 0 B2 0 0 0 0 0 + // r3 0 R3 0 g3 0 G3 0 b3 0 B3 0 0 0 0 0 + // ] + auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256); + // Compute output value as + // C += w0 * c2 + w1 * C2 + // C += w0 * c3 + w1 * C3 for each channel in 32-bit precision + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + + // Same as above for the high half of each 128-bit lane + auto source_hi = _mm256_unpackhi_epi8(source1, source2); + auto pix3 = _mm256_unpacklo_epi8(source_hi, zero_256); + sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk)); + auto pix4 = _mm256_unpackhi_epi8(source_hi, zero_256); + sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk)); + } + // Same processing as above but with a single weight value + for (; i < ids_size; i += 1) { + auto mmk = _mm256_set1_epi32(k[i]); + + auto source1 = _mm256_loadu_si256((__m256i*)(lineIn_min + i * data_size)); + + auto source_lo = _mm256_unpacklo_epi8(source1, zero_256); + auto pix1 = _mm256_unpacklo_epi8(source_lo, zero_256); + sss0 = _mm256_add_epi32(sss0, _mm256_madd_epi16(pix1, mmk)); + auto pix2 = _mm256_unpackhi_epi8(source_lo, zero_256); + sss1 = _mm256_add_epi32(sss1, _mm256_madd_epi16(pix2, mmk)); + + auto source_hi = _mm256_unpackhi_epi8(source1, zero_256); + auto pix3 = _mm256_unpacklo_epi8(source_hi, _mm256_setzero_si256()); + sss2 = _mm256_add_epi32(sss2, _mm256_madd_epi16(pix3, mmk)); + auto pix4 = _mm256_unpackhi_epi8(source_hi, _mm256_setzero_si256()); + sss3 = _mm256_add_epi32(sss3, _mm256_madd_epi16(pix4, mmk)); + } + // Convert fixed point values back to integers (truncating) + sss0 = 
_mm256_srai_epi32(sss0, coefs_precision); + sss1 = _mm256_srai_epi32(sss1, coefs_precision); + sss2 = _mm256_srai_epi32(sss2, coefs_precision); + sss3 = _mm256_srai_epi32(sss3, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss0 = _mm256_packs_epi32(sss0, sss1); + sss2 = _mm256_packs_epi32(sss2, sss3); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss0 = _mm256_packus_epi16(sss0, sss2); + + // Stores 32 bytes + _mm256_storeu_si256((__m256i*)(lineOut + j), sss0); + } + + // TODO: Do we also need block 4 ??? + // block 2 + const auto b2_usable_vec_stride = (8 / data_stride) * data_stride; + for (; j < data_size - vec_size / 4; j += b2_usable_vec_stride) { + auto sss0 = initial; + auto sss1 = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... 
] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load 2 pixels per line + // RGBA: source1 = [ + // r0 g0 b0 a0 r1 g1 b1 a1 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 g1 b1 r2 g2 0 0 0 0 0 0 0 0 + // ] + auto source1 = _mm_loadl_epi64((__m128i *) (lineIn_min + i * data_size)); + auto source2 = _mm_loadl_epi64((__m128i *) (lineIn_min + (i + 1) * data_size)); + // Interleave source1 and source2 and cast the result to epi16 + // RGBA: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // ] + // RGB: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // ] + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision + sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix, mmk)); + // RGBA: pix = [ + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 a1 0 A1 0 + // ] + // RGB: pix = [ + // r1 0 R1 0 g1 0 G1 0 b1 0 B1 0 0 0 0 0 + // ] + pix = _mm_unpackhi_epi8(source, zero); + // Compute output value as C += w0 * c1 + w1 * C1 for each channel in 32-bit precision + sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix, mmk)); + } + // Same processing as above but with a single weight value + for (; i < ids_size; i += 1) { + auto mmk = _mm_set1_epi32(k[i]); + + auto source1 = _mm_loadl_epi64((__m128i*) (lineIn_min + i * data_size)); + + auto source = _mm_unpacklo_epi8(source1, zero); + auto pix1 = _mm_unpacklo_epi8(source, zero); + sss0 = _mm_add_epi32(sss0, _mm_madd_epi16(pix1, mmk)); + auto pix2 = _mm_unpackhi_epi8(source, zero); + sss1 = _mm_add_epi32(sss1, _mm_madd_epi16(pix2, mmk)); + } + // Convert fixed point values back to integers (truncating) + sss0 = _mm_srai_epi32(sss0, coefs_precision); + sss1 = _mm_srai_epi32(sss1, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss0 = _mm_packs_epi32(sss0, sss1); + // Convert 
packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss0 = _mm_packus_epi16(sss0, sss0); + // Store 2 pixels to the output + _mm_storel_epi64((__m128i*)(lineOut + j), sss0); + } + + // block 1 + const auto b1_usable_vec_stride = (4 / data_stride) * data_stride; + const auto i32_aligned = num_channels == 4; + for (; j < data_size - 4; j += b1_usable_vec_stride) { + auto sss = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + + for (; i < ids_size - 1; i += 2) { + // Load 2 values from weight vector + // mmk = [wl_0 wh_0 wl_1 wh_1 wl_0 wh_0 wl_1 wh_1 ... ] + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load one pixel per line + // RGBA: source1 = [ + // r0 g0 b0 a0 0 0 0 0 0 0 0 0 0 0 0 0 + // ] + // RGB: source1 = [ + // r0 g0 b0 r1 0 0 0 0 0 0 0 0 0 0 0 0 + // ] + auto source1 = mm_cvtsi32_si128(lineIn_min + i * data_size, i32_aligned); + auto source2 = mm_cvtsi32_si128(lineIn_min + (i + 1) * data_size, i32_aligned); + + // Interleave source1 and source2 and cast the result to epi16 + // RGBA: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 a0 0 A0 0 + // ] + // RGB: pix = [ + // r0 0 R0 0 g0 0 G0 0 b0 0 B0 0 0 0 0 0 + // ] + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + // Compute output value as C += w0 * c0 + w1 * C0 for each channel in 32-bit precision + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + for (; i < ids_size; i++) { + auto mmk = _mm_set1_epi32(k[i]); + auto pix = mm_cvtepu8_epi32(lineIn_min + i * data_size, i32_aligned); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + sss = _mm_srai_epi32(sss, coefs_precision); + sss = _mm_packs_epi32(sss, zero); + sss = _mm_packus_epi16(sss, zero); + + auto o = _mm_cvtsi128_si32(sss); + + // Here we write 4 bytes to the output even if num_channels < 4, e.g o = {r,g,b,X} for num_channels=3 + // It is OK to write 4th byte (e.g. 
X) as on the next step we will overwrite it with new data. + // We also wont go out of bounds of lineOut memory allocation + std::memcpy(lineOut + j, (uint8_t *) &o, 4); + } + + for (; j < data_size; j += data_stride) { + auto sss = initial; + int64_t i = 0; + const auto * lineIn_min = lineIn + j + ids_min; + // For RGBA we can use (ids_size - 1) as tighter limit but for RGB we can read outside memory boundary + // for the last remaining line + for (; i < ids_size - 2; i += 2) { + // Load two coefficients at once + auto mmk = _mm_set1_epi32(*(int32_t*)&k[i]); + + // Load 2 lines + auto source1 = mm_cvtsi32_si128(lineIn_min + i * data_size, i32_aligned); + auto source2 = mm_cvtsi32_si128(lineIn_min + (i + 1) * data_size, i32_aligned); + + auto source = _mm_unpacklo_epi8(source1, source2); + auto pix = _mm_unpacklo_epi8(source, zero); + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Same processing as above but with a single weight value + for (; i < ids_size; i++) { + auto mmk = _mm_set1_epi32(k[i]); + + const uint8_t * p = lineIn_min + i * data_size; + __m128i pix; + // There is no much perf gain using more detailed condition like + // num_channels == 3 && ids_min + j + data_size * i + 4 >= in_max_size + // const int64_t in_max_size = data_size * in_ysize; + if (num_channels == 3) { + uint8_t input[4]; + std::memcpy(input, p, 3); + pix = mm_cvtepu8_epi32(input, true); + } else { + pix = mm_cvtepu8_epi32(p, true); + } + sss = _mm_add_epi32(sss, _mm_madd_epi16(pix, mmk)); + } + + // Convert fixed point values back to integers (truncating) + sss = _mm_srai_epi32(sss, coefs_precision); + // Convert packed signed 32-bit integers to packed 16-bit integers using signed saturation + // (a a a a b b b b c c c c d d d d) -> (a a b b c c d d) + sss = _mm_packs_epi32(sss, zero); + // Convert packed signed 16-bit integers to packed 8-bit integers using unsigned saturation + // (a a b b c c d d) -> (a b c d) + sss = _mm_packus_epi16(sss, zero); + // Store one 
pixel to the output + auto o = _mm_cvtsi128_si32(sss); + if (num_channels == 3 && C10_UNLIKELY(j + 4 >= data_size)) { + std::memcpy(lineOut + j, (uint8_t *) &o, 3); + } else { + std::memcpy(lineOut + j, (uint8_t *) &o, 4); + } + } +} + +} // anonymous namespace +#endif // CPU_CAPABILITY_AVX2 diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6e1f3ec3b029177763568e01f63d7d1467483ccb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/WeightNormKernel.h @@ -0,0 +1,20 @@ +#pragma once +#include +#include + +namespace at { +class TensorBase; +} + +namespace at { namespace native { + +using weight_norm_fn = void(*)( + TensorBase&, TensorBase&, const TensorBase&, const TensorBase&, int64_t); +using weight_norm_backward_fn = void(*)( + TensorBase&, TensorBase&, const TensorBase&, const TensorBase&, + const TensorBase&, const TensorBase&, int64_t); + +DECLARE_DISPATCH(weight_norm_fn, weight_norm_stub); +DECLARE_DISPATCH(weight_norm_backward_fn, weight_norm_backward_stub); + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h new file mode 100644 index 0000000000000000000000000000000000000000..ef598b281a905da24e74df13d95ef127828af3a1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/mixed_data_type.h @@ -0,0 +1,41 @@ +#pragma once + +#include + +namespace at { namespace native { + +inline ScalarType first_type() { + return ScalarType::Undefined; +} + +template +inline ScalarType 
first_type(const Tensor& arg, const Args&... parameters) { + return arg.defined() ? arg.scalar_type() : first_type(parameters...); +} + +template +inline bool is_mixed_type(const Tensor& input, const Args&... parameters) { + const auto parameter_type = first_type(parameters...); + return ((parameter_type != ScalarType::Undefined) && + (parameter_type != input.scalar_type())); +} + +// currently on CPU, mixed data type is only supported +// when input is 'BFloat16' or 'Half' and parameters are 'Float' +inline void check_mixed_data_type(const Tensor& input) { + TORCH_CHECK(at::isReducedFloatingType(input.scalar_type()), + "mixed dtype (CPU): all inputs must share same datatype."); +} + +template +inline void check_mixed_data_type(const Tensor& input, const Tensor& parameter, const Args&... parameters) { + TORCH_CHECK(!parameter.defined() || parameter.scalar_type() == ScalarType::Float, + "mixed dtype (CPU): expect parameter to have scalar type of Float"); + check_mixed_data_type(input, parameters...); +} + +inline ScalarType param_scalar_type(const Tensor& t, bool is_mixed_type) { + return is_mixed_type ? 
ScalarType::Float : t.scalar_type(); +} + +}} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/moments_utils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/moments_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..f5337f5ff4ebe4020f632b2c8f1c199aa79b20ec --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/moments_utils.h @@ -0,0 +1,206 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +inline namespace CPU_CAPABILITY { + +template using opmath_t = at::opmath_type; + +constexpr int64_t kChunkSize = 16; + +template +void AddMoments( + int64_t m0_add, + const T& m1_add, + const T& m2_add, + int64_t& m0, + T& m1, + T& m2) { + const int64_t n = m0 + m0_add; + const T c = n == 0 ? static_cast(0) : static_cast(m0_add) / static_cast(n); + const T delta = m1_add - m1; + m1 += c * delta; + m2 += m2_add + delta * delta * c * static_cast(m0); + m0 = n; +} + +template +C10_ALWAYS_INLINE void AddMomentsVec( + int64_t m0_add, + const vec::Vectorized& m1_add, + const vec::Vectorized& m2_add, + int64_t& m0, + vec::Vectorized& m1, + vec::Vectorized& m2) { + using Vec = vec::Vectorized; + const int64_t n = m0 + m0_add; + const T c = n == 0 ? 
static_cast(0) : static_cast(m0_add) / static_cast(n); + const Vec c_vec(c); + const Vec delta = m1_add - m1; + m1 += c_vec * delta; + m2 += m2_add + delta * delta * c_vec * Vec(static_cast(m0)); + m0 = n; +} + +template +inline typename std::enable_if>::value, void>::type +UpdateMomentsVec( + int64_t m0, + const T* X_ptr, + const std::array>, kChunkSize>& c_vecs, + int64_t& m0_stk0, + vec::Vectorized>& m1_stk0, + vec::Vectorized>& m2_stk0) { + using Vec = vec::Vectorized>; + Vec m1_vec(0); + Vec m2_vec(0); + for (const auto j : c10::irange(m0)) { + const Vec x_vec = Vec::loadu(X_ptr + j * Vec::size()); + const Vec delta_vec = x_vec - m1_vec; + m1_vec += delta_vec * c_vecs[j]; + m2_vec += delta_vec * (x_vec - m1_vec); + } + AddMomentsVec(m0, m1_vec, m2_vec, m0_stk0, m1_stk0, m2_stk0); +} + +// each bfloat16/half vector will be converted to two float vectors, +// and accumulated successively on m1_stk0/m2_stk0. +template +inline typename std::enable_if>::value, void>::type +UpdateMomentsVec( + int64_t m0, + const T* X_ptr, + const std::array>, kChunkSize>& c_vecs, + int64_t& m0_stk0, + vec::Vectorized>& m1_stk0, + vec::Vectorized>& m2_stk0) { + using Vec = vec::Vectorized; + using fVec = vec::Vectorized>; + fVec m1_fvec0(0), m1_fvec1(0); + fVec m2_fvec0(0), m2_fvec1(0); + for (const auto j : c10::irange(m0)) { + const Vec x_bvec = Vec::loadu(X_ptr + j * Vec::size()); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + const fVec delta_fvec0 = x_fvec0 - m1_fvec0; + const fVec delta_fvec1 = x_fvec1 - m1_fvec1; + m1_fvec0 += delta_fvec0 * c_vecs[j]; + m1_fvec1 += delta_fvec1 * c_vecs[j]; + m2_fvec0 += delta_fvec0 * (x_fvec0 - m1_fvec0); + m2_fvec1 += delta_fvec1 * (x_fvec1 - m1_fvec1); + } + AddMomentsVec(m0, m1_fvec0, m2_fvec0, m0_stk0, m1_stk0, m2_stk0); + AddMomentsVec(m0, m1_fvec1, m2_fvec1, m0_stk0, m1_stk0, m2_stk0); +} + +// Compute rowwise moments by Welford algorithm and cascade sum to improve +// numerical stability. 
+// https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance +// https://en.wikipedia.org/wiki/Pairwise_summation +template +std::pair, opmath_t> RowwiseMomentsImpl(const T* X, int64_t N, int64_t ddof = 0) { + using math_t = opmath_t; + + constexpr int64_t kVecSize = vec::Vectorized::size(); + constexpr int64_t kAccVecSize = vec::Vectorized::size(); + const int64_t n = N / kVecSize; + const int64_t m = divup(n, kChunkSize); + const int64_t depth = utils::CeilLog2(m); + + using Vec = vec::Vectorized; + const Vec kZeroVec(math_t(0)); + c10::SmallVector m0_stk(depth, 0); + c10::SmallVector m1_stk(depth, kZeroVec); + c10::SmallVector m2_stk(depth, kZeroVec); + + for (const auto i : c10::irange(m)) { + const T* X_ptr = X + i * kChunkSize * kVecSize; + const int64_t m0 = std::min(kChunkSize, n - i * kChunkSize); + static std::array c_vecs = ([]() { + std::array result; + for (const auto i : c10::irange(kChunkSize)) { + result[i] = Vec(math_t(1) / static_cast(i + 1)); + } + return result; + })(); + UpdateMomentsVec(m0, X_ptr, c_vecs, m0_stk[0], m1_stk[0], m2_stk[0]); + + int64_t mask = i + 1; + for (int64_t j = 1; j < depth && (mask & 1) == 0; ++j) { + AddMomentsVec( + m0_stk[j - 1], + m1_stk[j - 1], + m2_stk[j - 1], + m0_stk[j], + m1_stk[j], + m2_stk[j]); + m0_stk[j - 1] = 0; + m1_stk[j - 1] = kZeroVec; + m2_stk[j - 1] = kZeroVec; + mask >>= 1; + } + } + for (const auto i : c10::irange(1, depth)) { + AddMomentsVec( + m0_stk[i], m1_stk[i], m2_stk[i], m0_stk[0], m1_stk[0], m2_stk[0]); + } + + std::array m1_arr{}; + std::array m2_arr{}; + m1_stk[0].store(m1_arr.data()); + m2_stk[0].store(m2_arr.data()); + + int64_t m0 = 0; + math_t m1 = 0; + math_t m2 = 0; + for (int64_t i = n * kVecSize; i < N; ++i) { + math_t x = static_cast(X[i]); + const math_t delta = x - m1; + ++m0; + m1 += delta / static_cast(m0); + m2 += delta * (x - m1); + } + // for BFloat16, each vector in m1_arr/m2_arr holds 2*n accumulated result + int64_t m0_add = n * kVecSize / kAccVecSize; + for 
(const auto i : c10::irange(kAccVecSize)) { + AddMoments(m0_add, m1_arr[i], m2_arr[i], m0, m1, m2); + } + + return std::make_pair(m1, m2 / static_cast(N - ddof)); +} + +template +std::pair, opmath_t> RowwiseMoments(const T* X, int64_t N, int64_t ddof = 0) { + using Vec = vec::Vectorized; + constexpr int64_t kVecSize = Vec::size(); + const int64_t n = N / kVecSize; + const int64_t m = divup(n, kChunkSize); + const int64_t depth = utils::CeilLog2(m); + if (depth <= 4) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 8) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 16) { + return RowwiseMomentsImpl(X, N, ddof); + } else if (depth <= 32) { + return RowwiseMomentsImpl(X, N, ddof); + } else { + return RowwiseMomentsImpl(X, N, ddof); + } +} + +} // namespace CPU_CAPABILITY +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1aefc2474fdfc5cab69c89a8fdb1b8be982394e8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cuda/TensorModeKernel.cuh @@ -0,0 +1,435 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace native { + +// Used for a segmented reduction +struct ModeUnsignedBoolPair { + unsigned int val; + bool flag; +}; + +// In the kernel below, we have a common pattern of reducing (unsigned int, +// unsigned int) pairs of data +struct ModeUnsignedPair { + unsigned int val; + unsigned int index; +}; + +// Inclusive Scan via an upsweep/downsweep mechanism. Assumes: +// +// 1. Power2ScanSize is a power of 2. 
This code still works for collections that +// do not exactly contain a power of 2 number of elements, simply round up to +// the nearest power of 2 and then call. +// +// 2. That there are two-elements per thread, i.e. the size of the smem storage +// is 2 * blockDim.x * sizeof(T). +// +// Consider a (+)-Scan on the following elements: +// +// Upsweep: +// +// 0 1 2 3 4 5 6 7 +// 1 5 9 13 +// 6 22 +// 28 +// +// Downsweep: +// 15 +// 3 10 21 +template +__device__ void inclusivePrefixScan(T* smem, BinaryOp binop) { + // Reduce step ("upsweep") +#pragma unroll + for (int stride = 1; stride < Power2ScanSize; stride <<= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if (index < Power2ScanSize) { + smem[index] = binop(smem[index], smem[index - stride]); + } + __syncthreads(); + } + + // Post-reduce step ("downsweep") +#pragma unroll + for (int stride = Power2ScanSize / 4; stride > 0; stride >>= 1) { + int index = (threadIdx.x + 1) * stride * 2 - 1; + if ((index + stride) < Power2ScanSize) { + smem[index + stride] = binop(smem[index + stride], smem[index]); + } + __syncthreads(); + } +} + +// Block-wide reduction where each thread locally reduces N +// values before letting a single warp take over - assumes +// threadVals is in registers, not shared memory +// +// If smem is not used again, there is no need to __syncthreads before this +// call. However, if smem will be used, e.g., this function is called in a loop, +// then __syncthreads is needed either before or afterwards to prevent non-0 +// threads overriding smem in the next loop before num-0 thread reads from it. +template +__device__ T reduceBlockWithNThreadLocalReductions( + T* smem, + T threadVals[N], + const unsigned int numVals, + ReduceOp reduceOp, + T init) { + int offset = threadIdx.x * N; + T local = offset < numVals ? threadVals[0] : init; + +#pragma unroll + for (int i = 1; i < N; ++i) { + ++offset; + T next = offset < numVals ? 
threadVals[i] : init; + local = reduceOp.combine(local, next); + } + + return cuda_utils::BlockReduce(local, reduceOp, init, smem); +} + +template +__device__ inline void swapVars(T& t1, T& t2) { + T tmp = t1; + t1 = t2; + t2 = tmp; +} + +template +__device__ inline void bitonicSwap( + K& kA, + V& vA, + bool& validA, + K& kB, + V& vB, + bool& validB, + bool dir, + const Comparator& comp) { + // Invalid entries always sort to the end + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(vA, vB); + swapVars(validA, validB); + } +}; + +template +__device__ inline void bitonicSwapKeys( + K& kA, + bool& validA, + K& kB, + bool& validB, + bool dir, + const Comparator& comp) { + bool swap = (comp(kA, kB) && validA) || !validB; + if (swap == dir) { + swapVars(kA, kB); + swapVars(validA, validB); + } +} + +template < + typename K, + typename IndexType, + int Power2SortSize, + typename Comparator> +__device__ inline void bitonicSortKeys( + K keys[Power2SortSize], + bool valid[Power2SortSize], + const Comparator& comp) { +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int size = 2; size < Power2SortSize; size *= 2) { + bool flag = ((threadIdx.x & (size / 2)) != 0); + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = size / 2; stride > 0; stride /= 2) { + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], + valid[pos], + keys[pos + stride], + valid[pos + stride], + flag, + comp); + } + } + +#if !defined(USE_ROCM) +#pragma unroll +#endif + for (unsigned int stride = Power2SortSize / 2; stride > 0; stride /= 2) { + __syncthreads(); + + unsigned int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + bitonicSwapKeys( + keys[pos], + valid[pos], + keys[pos + stride], + valid[pos + stride], + false, + comp); + } + + __syncthreads(); +} + +// The mode kernel has the following characteristics: It uses internal shared +// 
memory buffers of Power2Size, which must be greater than the number of +// elements. Additionally, there is one block for every slice to calculate the +// mode for, and in each block there is one thread for every two elements. +// +// Both sorted and positions are assumed to be contiguous Tensors with the mode +// dimension as the innermost dim, such that we can get the particular slice for +// a Tensor via its linear block dimension * the slice size. +template +#if defined(CUDA_VERSION) && CUDA_VERSION >= 11070 +__launch_bounds__(1024, 1) +#endif +__global__ void compute_mode( + const T* input, + at::cuda::detail::TensorInfo values, + at::cuda::detail::TensorInfo indices, + int64_t sliceSize, + int64_t slices) { + int tidx = threadIdx.x; + int stidx = blockDim.x + threadIdx.x; // Second index this thread responsible for + + // First, we need to calculate the offset into the sorted Tensor that + // represents the start of the slice for this block to calculate the mode for. + // This offset is a combination of the gridIndices, and the number of elements + // in the slice. + unsigned int blockId = getLinearBlockId(); + unsigned int linearOffset = blockId * sliceSize; + + if (blockId >= slices) { + return; + } + + // shmem is a dynamically sized buffer we will use throughout the kernel to + // handle computation efficiently. 
The size of this shmem must be + // sizeof(T) * Power2Size + (2 * sizeof(unsigned int) * Power2Size) + // + // Initially, the buffer will be organized as follows: + // + // [smem (slice elements) | bmem (valid indices) | ] + extern __shared__ char shmem[]; + + // smem represents a proportion of the shared memory buffer that is used to + // store the elements from the slice: + T* smem = reinterpret_cast(shmem); + + // Each thread loads up to two elements from the Tensor into shared memory + if (tidx < sliceSize) { + smem[tidx] = c10::load(&input[linearOffset + tidx]); + } + if (stidx < sliceSize) { + smem[stidx] = c10::load(&input[linearOffset + stidx]); + } + + // Next, we initialize a boolean region of the buffer, offset by the loaded + // element smem region + bool* bmem = reinterpret_cast(&smem[Power2Size]); + + // The first use of this region stores bmem[i] = i < sliceSize to mark the + // valid components in the smem buffer + bmem[tidx] = tidx < sliceSize; + bmem[stidx] = stidx < sliceSize; + __syncthreads(); // barrier for smem, bmem initialization + + // First, sort the input slice in ascending order. smem contains the input + // elements, and bmem marks the valid indices + bitonicSortKeys( + smem, bmem, [&] GPU_LAMBDA(const auto& a, const auto& b) { + return a < b; + }); + __syncthreads(); // make no assumptions that the sort syncs at end + + // The next step of our algorithm is performing a block-wide comparison of + // neighboring elements. In particular, given an sorted input slice A, we + // produce an output slice B, such that B[i] = 1 if A[i-i] != A[i], otherwise + // 0. + // + // Given the input A = [0, 0, 1, 1, 2, 2, 2, 4, 5, 6, 6, 7, 8] + // B = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // + // In particular, we can think of B[i] true indicating the start of a sequence + // of equal values in the sorted list. Similarly, we will also store the + // negation of B, which we'll call C. 
In particular, we can think of C[i] = + // true iff A[i-1] == A[i] in our original sorted slice. + // + // C = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + + // We overwrite bmem, and treat the rest of shared memory as a buffer of + // (index, flag) pairs where the index represents values from C, and the flag + // represents values from B. + // + // [smem (sorted slice) | ubpmem (index, flag pairs)] + + struct ModeUnsignedBoolPair* ubpmem = + reinterpret_cast(&smem[Power2Size]); + + if (tidx == 0) { + ubpmem[0].flag = true; + ubpmem[0].val = 0; + } + + // Compares elements (0, 1), (2, 3), ... and sets 1, 3, ... + ubpmem[tidx * 2 + 1].flag = + smem[tidx * 2] != smem[tidx * 2 + 1]; // (0, 1), (1, 2), etc. + ubpmem[tidx * 2 + 1].val = !ubpmem[tidx * 2 + 1].flag; + + // Compares elements (1, 2), (3, 4), ... and sets 2, 4, ... + if (((tidx + 1) * 2) < Power2Size) { + ubpmem[(tidx + 1) * 2].flag = + smem[((tidx + 1) * 2) - 1] != smem[(tidx + 1) * 2]; + ubpmem[(tidx + 1) * 2].val = !ubpmem[(tidx + 1) * 2].flag; + } + __syncthreads(); // barrier for ubpmem initialization + + // Next, we perform a segmented prefix sum on the neighboring elements, where + // the presence of a one indicates the start of a segment. In this case B acts + // as the segment start flags, and C is the buffer to be summed: + // + // Input (C) = [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0] + // Flag (B) = [1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1] + // Output (C) = [0, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 0] + // + // Afterwards, the (index) components of the ubpmem buffer contain the lengths + // of the segments (minus 1), i.e. the counts of each element in the original + // input. + inclusivePrefixScan( + ubpmem, [=] GPU_LAMBDA(const auto& a, const auto& b) { + ModeUnsignedBoolPair c; + c.val = a.flag ? a.val : a.val + b.val; + c.flag = a.flag | b.flag; + return c; + }); + // assumes scan syncs at the end + + // Next, we reinterpret the ubpmem buffer as pairs of unsigned integers (i.e. 
+ // we treat the boolean flag regions as integers). We initialize these to + // represent indices, and we'll call this buffer I + struct ModeUnsignedPair* uupmem = + reinterpret_cast(ubpmem); + + // At this point, we need to find the maximum element in lengths buffer C. + // This element will represent the count (-1) of the mode. Because of the + // way we have set up the problem, the index where this mode occurs will + // also be the location of the mode value in the sorted array, e.g. + // + // smem = [0, 0, 1, 1, 1, 2] + // C = [0, 1, 0, 1, 2, 0] + // I = [0, 1, 2, 3, 4, 5] + // ^ + // maximum value, also aligned with mode = 1 + // + // We perform a block wide max-reduction of the C buffer, but we also need the + // indices to come along with it, so we utilize the uupmem construction. + // + // At the end we need to return the ModeUnsignedPair containing index = 4, val + // = 2, which represents the max + + // In practice, we will make each thread locally reduce 2 values in its + // registers prior to the global block-wide reduction. Note that instead of + // tidx/stidx, we utilize tidx * 2, tidx * 2 + 1, so each thread deals with + // adjacent elements. This is because the reduce code below relies on thread + // elements to be adjacent. + struct ModeUnsignedPair uup[2]; + uup[0].index = tidx * 2; + uup[0].val = ubpmem[tidx * 2].val; + uup[1].index = tidx * 2 + 1; + uup[1].val = ubpmem[tidx * 2 + 1].val; + __syncthreads(); + + struct ModeUnsignedPair max = {0, 0}; + + struct MaxOp { + inline __device__ ModeUnsignedPair combine(ModeUnsignedPair a, ModeUnsignedPair b) const { + return b.val > a.val ? 
b : a; + } + + inline __device__ ModeUnsignedPair warp_shfl_down(ModeUnsignedPair acc, int offset) const { + ModeUnsignedPair ret; + ret.index = WARP_SHFL_DOWN(acc.index, offset); + ret.val = WARP_SHFL_DOWN(acc.val, offset); + return ret; + } + } max_op; + + max = reduceBlockWithNThreadLocalReductions<2>( + uupmem, + uup, + sliceSize, + max_op, + max); + + // Store the mode in shared memory for use in finding the mode in the input + // slice + __shared__ T mode; + + // Given the above constraints, the mode is the value at the reduced index in + // the original sorted element buffer + if (tidx == 0) { + mode = smem[max.index]; + } + __syncthreads(); // broadcast mode + + // Finally, we need to find "an" index of the mode in the input + // Tensor. The API does not constrain which index we pick, but here + // we always pick the largest index. We store the index if the value + // is the mode, or 0 otherwise. Then find the maximum value. + // + // Again we reduce 2 elements in the thread's registers prior to the + // block-wide reduction + unsigned mode_index[2] = {0u, 0u}; + if (tidx * 2 < sliceSize) { + const unsigned idx = tidx * 2; + mode_index[0] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u; + } + if (tidx * 2 + 1 < sliceSize) { + const unsigned idx = tidx * 2 + 1; + mode_index[1] = c10::load(&input[linearOffset + idx]) == mode ? idx : 0u; + } + + struct MaxIndexOp { + inline __device__ unsigned combine(unsigned a, unsigned b) const { + return b > a ? b : a; + } + + inline __device__ unsigned warp_shfl_down(unsigned acc, int offset) const { + return WARP_SHFL_DOWN(acc, offset); + } + } max_index_op; + + int64_t index = reduceBlockWithNThreadLocalReductions<2>( + reinterpret_cast(&shmem[0]), + mode_index, + sliceSize, + max_index_op, + 0u); + + // Finally, we have the mode, and an index where it occurs. 
We use a single + // thread to place this in the appropriate output position + if (tidx == 0) { + unsigned int outputOffset = + at::cuda::detail::IndexToOffset::get( + blockId, values); + values.data[outputOffset] = mode; + indices.data[outputOffset] = index; + } +} + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..51eeaf29191112356e8ad0d32b8aebf59b3f0733 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorBinaryOps.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { + +enum class NESTED_DENSE_OP: uint8_t {ADD, MUL}; + +using nested_dense_elementwise_fn = void (*)(Tensor& result, const Tensor & self, const Tensor & other, const NESTED_DENSE_OP& op); + +DECLARE_DISPATCH(nested_dense_elementwise_fn, nested_dense_elementwise_stub); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/vol2col.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/vol2col.h new file mode 100644 index 0000000000000000000000000000000000000000..ccbfc69ce3c621bba863ceabdc133a16a8590bc6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/vol2col.h @@ -0,0 +1,109 @@ +#pragma once + +#include + +namespace at::native { + +template +static void vol2col( + const T* data_vol, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t depth_col, + const int64_t height_col, + const int64_t width_col, + const int64_t kT, + const int64_t 
kernel_height, + const int64_t kernel_width, + const int64_t pT, + const int64_t pH, + const int64_t pW, + const int64_t dT, + const int64_t dH, + const int64_t dW, + const int64_t dilationT, + const int64_t dilationH, + const int64_t dilationW, + T* data_col) { + int64_t c, t, h, w; + int64_t channels_col = channels * kT * kernel_height * kernel_width; + for (c = 0; c < channels_col; ++c) { + int64_t w_offset = c % kernel_width; + int64_t h_offset = (c / kernel_width) % kernel_height; + int64_t t_offset = (c / kernel_width / kernel_height) % kT; + int64_t c_vol = c / kT / kernel_height / kernel_width; + for (t = 0; t < depth_col; ++t) { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (h = 0; h < height_col; ++h) { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (w = 0; w < width_col; ++w) { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + data_vol + [((c_vol * depth + t_pad) * height + h_pad) * width + + w_pad]; + else + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + 0; + } + } + } + } +} + +template +static void col2vol( + const T* data_col, + const int64_t channels, + const int64_t depth, + const int64_t height, + const int64_t width, + const int64_t out_depth, + const int64_t out_height, + const int64_t out_width, + const int64_t kT, + const int64_t kernel_height, + const int64_t kernel_width, + const int64_t pT, + const int64_t pH, + const int64_t pW, + const int64_t dT, + const int64_t dH, + const int64_t dW, + const int64_t dilationT, + const int64_t dilationH, + const int64_t dilationW, + T* data_vol) { + memset(data_vol, 0, sizeof(T) * depth * height * width * channels); + int64_t depth_col = out_depth; + int64_t height_col = out_height; + int64_t width_col = out_width; + int64_t channels_col = channels * kT * kernel_height * 
kernel_width; + for (int64_t c = 0; c < channels_col; ++c) { + int64_t w_offset = c % kernel_width; + int64_t h_offset = (c / kernel_width) % kernel_height; + int64_t t_offset = (c / kernel_width / kernel_height) % kT; + int64_t c_vol = c / kT / kernel_height / kernel_width; + for (int64_t t = 0; t < depth_col; ++t) { + int64_t t_pad = t * dT - pT + t_offset * dilationT; + for (int64_t h = 0; h < height_col; ++h) { + int64_t h_pad = h * dH - pH + h_offset * dilationH; + for (int64_t w = 0; w < width_col; ++w) { + int64_t w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_vol + [((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += + data_col + [((c * depth_col + t) * height_col + h) * width_col + w]; + } + } + } + } +} + +} // namespace at::native diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cast_Long_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cast_Long_native.h new file mode 100644 index 0000000000000000000000000000000000000000..291640d2a8d96e3bb2e1cdb865b1bb60b2f43d18 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cast_Long_native.h @@ -0,0 +1,21 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor _cast_Long(const at::Tensor & self, bool non_blocking=false); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cufft_clear_plan_cache_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cufft_clear_plan_cache_compositeimplicitautograd_dispatch.h new 
file mode 100644 index 0000000000000000000000000000000000000000..72d5f19e897c355777bd05526b2636843b095493 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cufft_clear_plan_cache_compositeimplicitautograd_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API void _cufft_clear_plan_cache(at::DeviceIndex device_index); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fft_c2c_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fft_c2c_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..73bda3fee22503499bfac0a21bed71c1411de498 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fft_c2c_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _fft_c2c { + using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_fft_c2c") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor") + static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward); +}; + +struct TORCH_API _fft_c2c_out { + using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, int64_t, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_fft_c2c") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef dim, int64_t normalization, bool forward, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..23021887b559d20b51da2b961b21b210c14563c6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _foreach_expm1 { + using schema = ::std::vector (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1(Tensor[] self) -> Tensor[]") + static ::std::vector call(at::TensorList self); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_expm1_ { + using schema = void (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1_(Tensor(a!)[] self) -> ()") + static void call(at::TensorList self); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_expm1_out { + using schema = void (at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1.out(Tensor[] self, *, Tensor(a!)[] out) -> ()") + static void call(at::TensorList self, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log1p_cuda_dispatch.h 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log1p_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ce50d61345e05742017d1a337c68bf56dcaffd22 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log1p_cuda_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API ::std::vector _foreach_log1p(at::TensorList self); +TORCH_API void _foreach_log1p_(at::TensorList self); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_reciprocal_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_reciprocal_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..946bec271173fa5f4d2f0150300ac2ea24823d90 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_reciprocal_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _foreach_reciprocal { + using schema = ::std::vector (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_reciprocal") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_reciprocal(Tensor[] self) -> Tensor[]") + static ::std::vector call(at::TensorList self); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_reciprocal_ { + using schema = void (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_reciprocal_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_reciprocal_(Tensor(a!)[] self) -> ()") + static void call(at::TensorList self); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_reciprocal_out { + using schema = void (at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_reciprocal") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_reciprocal.out(Tensor[] self, *, Tensor(a!)[] out) -> ()") + static void call(at::TensorList self, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_indices_copy.h 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_indices_copy.h new file mode 100644 index 0000000000000000000000000000000000000000..4132ea8c0d6058d8be5c29c899c0a6200db7f773 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_indices_copy.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_indices_copy(Tensor self) -> Tensor +inline at::Tensor _indices_copy(const at::Tensor & self) { + return at::_ops::_indices_copy::call(self); +} + +// aten::_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _indices_copy_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::_indices_copy_out::call(self, out); +} +// aten::_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _indices_copy_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::_indices_copy_out::call(self, out); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_svd_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_svd_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f1d92b9efe1a4c6ef1f31a194a36b0c3b7dd8170 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_svd_meta_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple _linalg_svd(const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, c10::optional driver=c10::nullopt); +TORCH_API ::std::tuple _linalg_svd_out(at::Tensor & U, at::Tensor & S, at::Tensor & Vh, const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, c10::optional driver=c10::nullopt); +TORCH_API ::std::tuple _linalg_svd_outf(const at::Tensor & A, bool full_matrices, bool compute_uv, c10::optional driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh); + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lstm_mps.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lstm_mps.h new file mode 100644 index 0000000000000000000000000000000000000000..89e6e7fa5602e19c283dc0baa500a27255dc3812 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lstm_mps.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_lstm_mps(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor) +inline ::std::tuple _lstm_mps(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::_lstm_mps::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first); +} + +// aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, 
Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!)) +inline ::std::tuple _lstm_mps_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5, const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::_lstm_mps_out::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2, out3, out4, out5); +} +// aten::_lstm_mps.out(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2, Tensor(d!) out3, Tensor(e!) out4, Tensor(f!) out5) -> (Tensor(a!), Tensor(b!), Tensor(c!), Tensor(d!), Tensor(e!), Tensor(f!)) +inline ::std::tuple _lstm_mps_outf(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5) { + return at::_ops::_lstm_mps_out::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first, out0, out1, out2, out3, out4, out5); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_get_values_copy_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_get_values_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..d6067159fa3d3dd1d9c320bca4f5f37ad947cd2d --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_get_values_copy_native.h @@ -0,0 +1,22 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _nested_get_values_copy_out(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor _nested_get_values_copy(const at::Tensor & self); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..420e3703d34a14b61e41a6251b2e8b84feb0580d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sparse_mask_projection { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_mask_projection") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor") + static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches); +}; + +struct TORCH_API _sparse_mask_projection_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_mask_projection") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_mask_projection.out(Tensor self, Tensor mask, bool accumulate_matches=False, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_spdiags.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_spdiags.h new file mode 100644 index 0000000000000000000000000000000000000000..0c8bb5b0a77a017fbc1769eb541f924b2315c40b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_spdiags.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_spdiags(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None) -> Tensor +inline at::Tensor _spdiags(const at::Tensor & diagonals, const at::Tensor & offsets, at::IntArrayRef shape, c10::optional layout=c10::nullopt) { + return at::_ops::_spdiags::call(diagonals, offsets, shape, layout); +} + +// aten::_spdiags.out(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _spdiags_out(at::Tensor & out, const at::Tensor & diagonals, const at::Tensor & offsets, at::IntArrayRef shape, c10::optional layout=c10::nullopt) { + return at::_ops::_spdiags_out::call(diagonals, offsets, shape, layout, out); +} +// aten::_spdiags.out(Tensor diagonals, Tensor offsets, int[] shape, Layout? layout=None, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _spdiags_outf(const at::Tensor & diagonals, const at::Tensor & offsets, at::IntArrayRef shape, c10::optional layout, at::Tensor & out) { + return at::_ops::_spdiags_out::call(diagonals, offsets, shape, layout, out); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_optional_intlist_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_optional_intlist_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..5799d125ef94b3fd8cb9fb38fc7e2cf63e828f77 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_optional_intlist_compositeexplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _test_optional_intlist_out(at::Tensor & out, const at::Tensor & values, at::OptionalIntArrayRef addends); +TORCH_API at::Tensor & _test_optional_intlist_outf(const at::Tensor & values, at::OptionalIntArrayRef addends, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..55fab16fb10461c6962cc1f107f790d2ffb20002 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_cpu_dispatch.h @@ -0,0 +1,28 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _upsample_bilinear2d_aa(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional scales_h=c10::nullopt, c10::optional scales_w=c10::nullopt); +TORCH_API at::Tensor _upsample_bilinear2d_aa_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional scales_h=c10::nullopt, c10::optional scales_w=c10::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional scales_h=c10::nullopt, c10::optional scales_w=c10::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_outf(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional scales_h, c10::optional scales_w, at::Tensor & out); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional scales_h=c10::nullopt, c10::optional scales_w=c10::nullopt); +TORCH_API at::Tensor & _upsample_bilinear2d_aa_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional scales_h, c10::optional scales_w, at::Tensor & out); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/abs_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/abs_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fb0a3bcf28c239d269e2cef4acbc1b20c1fe81b2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/abs_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API abs { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::abs") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "abs(Tensor self) -> Tensor") + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API abs_ { + using schema = at::Tensor & (at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::abs_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "abs_(Tensor(a!) self) -> Tensor(a!)") + static at::Tensor & call(at::Tensor & self); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); +}; + +struct TORCH_API abs_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::abs") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "abs.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/aminmax.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/aminmax.h new file mode 100644 index 0000000000000000000000000000000000000000..bca1f59c0cf037d7fe7c4e81e3187f4ac9be21ec --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/aminmax.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max) +inline ::std::tuple aminmax(const at::Tensor & self, c10::optional dim=c10::nullopt, bool keepdim=false) { + return at::_ops::aminmax::call(self, dim, keepdim); +} + +// aten::aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max) +inline ::std::tuple aminmax_out(at::Tensor & min, at::Tensor & max, const at::Tensor & self, c10::optional dim=c10::nullopt, bool keepdim=false) { + return at::_ops::aminmax_out::call(self, dim, keepdim, min, max); +} +// aten::aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) 
max) +inline ::std::tuple aminmax_outf(const at::Tensor & self, c10::optional dim, bool keepdim, at::Tensor & min, at::Tensor & max) { + return at::_ops::aminmax_out::call(self, dim, keepdim, min, max); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7e2987cc5ce81e8d6764dfb6fa3951b257eaeed9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor arcsinh(const at::Tensor & self); +TORCH_API at::Tensor & arcsinh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & arcsinh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & arcsinh_(at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f85e61dd4f6f0b36825a680506b715c1bc7e6596 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API avg_pool2d_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, bool, c10::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::avg_pool2d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional divisor_override, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional divisor_override, at::Tensor & out); +}; + +struct TORCH_API avg_pool2d { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, bool, c10::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::avg_pool2d") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? 
divisor_override=None) -> Tensor") + static at::Tensor call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional divisor_override); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional divisor_override); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..fdb1f20510b0db48ffb8e76b3c1a90faa9515c3b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_compositeimplicitautograd_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor batch_norm(const at::Tensor & input, const c10::optional & weight, const c10::optional & bias, const c10::optional & running_mean, const c10::optional & running_var, bool training, double momentum, double eps, bool cudnn_enabled); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c1f8abf82221ca4add8642fa8c050cd1fff54d5c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h @@ -0,0 +1,22 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor block_diag(at::TensorList tensors); +TORCH_API at::Tensor & block_diag_out(at::TensorList tensors, at::Tensor & out); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3427483dc742ceb26cf35ae0c68792a51bcf6a7b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes 
that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor cat(at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & cat_out(at::Tensor & out, at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & cat_outf(at::TensorList tensors, at::Dimname dim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_native.h new file mode 100644 index 0000000000000000000000000000000000000000..fee76e883c81ec174a9aaac59cc21023ee3d7595 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_native.h @@ -0,0 +1,32 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_cat_out_cpu : public at::meta::structured_cat { +void impl(const at::ITensorListRef & tensors, int64_t dim, int64_t valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, at::MemoryFormat memory_format, const at::Tensor & out); +}; +struct TORCH_API structured_cat_out_cuda : public at::meta::structured_cat { +void impl(const at::ITensorListRef & tensors, int64_t dim, int64_t valid, bool all_contiguous, bool all_same_dtype, bool all_same_sizes_and_stride, at::MemoryFormat memory_format, const at::Tensor & out); +}; +TORCH_API at::Tensor cat_nested(const 
at::ITensorListRef & tensors, int64_t dim=0); +TORCH_API at::Tensor cat_sparse(const at::ITensorListRef & tensors, int64_t dim=0); +TORCH_API at::Tensor cat_quantized_cpu(const at::ITensorListRef & tensors, int64_t dim=0); +TORCH_API at::Tensor & cat_out_quantized_cpu(const at::ITensorListRef & tensors, int64_t dim, at::Tensor & out); +TORCH_API at::Tensor cat(at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & cat_out(at::TensorList tensors, at::Dimname dim, at::Tensor & out); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/concat_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/concat_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2dcdbbd066d8ec8a4c3c5e3a5e4474079d4d24cc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/concat_ops.h @@ -0,0 +1,61 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API concat { + using schema = at::Tensor (at::TensorList, int64_t); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::concat") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "concat(Tensor[] tensors, int dim=0) -> Tensor") + static at::Tensor call(at::TensorList tensors, int64_t dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim); +}; + +struct TORCH_API concat_out { + using schema = at::Tensor & (at::TensorList, int64_t, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::concat") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "concat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(at::TensorList tensors, int64_t dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, int64_t dim, at::Tensor & out); +}; + +struct TORCH_API concat_names { + using schema = at::Tensor (at::TensorList, at::Dimname); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::concat") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "names") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "concat.names(Tensor[] tensors, Dimname dim) -> Tensor") + static at::Tensor call(at::TensorList tensors, at::Dimname dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim); +}; + +struct TORCH_API concat_names_out { + using schema = at::Tensor & (at::TensorList, at::Dimname, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::concat") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "names_out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "concat.names_out(Tensor[] tensors, Dimname dim, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(at::TensorList tensors, at::Dimname dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList tensors, at::Dimname dim, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_batch_norm_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_batch_norm_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..52cc4a4123d0a3417aec569f52b2a79cf2c117c6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_batch_norm_backward_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API cudnn_batch_norm_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const c10::optional &, const c10::optional &, const c10::optional &, const c10::optional &, double, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::cudnn_batch_norm_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "cudnn_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? 
save_var, float epsilon, Tensor reserveSpace) -> (Tensor, Tensor, Tensor)") + static ::std::tuple call(const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional & running_mean, const c10::optional & running_var, const c10::optional & save_mean, const c10::optional & save_var, double epsilon, const at::Tensor & reserveSpace); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional & running_mean, const c10::optional & running_var, const c10::optional & save_mean, const c10::optional & save_var, double epsilon, const at::Tensor & reserveSpace); +}; + +struct TORCH_API cudnn_batch_norm_backward_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const c10::optional &, const c10::optional &, const c10::optional &, const c10::optional &, double, const at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::cudnn_batch_norm_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "cudnn_batch_norm_backward.out(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, Tensor reserveSpace, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) 
out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))") + static ::std::tuple call(const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional & running_mean, const c10::optional & running_var, const c10::optional & save_mean, const c10::optional & save_var, double epsilon, const at::Tensor & reserveSpace, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional & running_mean, const c10::optional & running_var, const c10::optional & save_mean, const c10::optional & save_var, double epsilon, const at::Tensor & reserveSpace, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..946edb16883ffccb46a7ecf3811f29ffa77664de --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor cudnn_grid_sampler(const at::Tensor & self, const at::Tensor & grid); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..682ddfa2f71fc50165b3c0fd4ec20d9800c6a678 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/exp_meta_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor exp(const at::Tensor & self); +TORCH_API at::Tensor & exp_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & exp_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & exp_(at::Tensor & self); + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h new file mode 100644 index 0000000000000000000000000000000000000000..d3aabb82c9a37db4669e2861e31407fd5532eed7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h @@ -0,0 +1,30 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor +inline at::Tensor fake_quantize_per_channel_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max) { + return at::_ops::fake_quantize_per_channel_affine::call(self, scale, zero_point, axis, quant_min, quant_max); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e1ec9f95353fbabea71d32c04d04d39313e711f9 --- /dev/null +++ 
b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like_compositeexplicitautograd_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor full_like(const at::Tensor & self, const at::Scalar & fill_value, at::TensorOptions options={}, c10::optional memory_format=c10::nullopt); +TORCH_API at::Tensor full_like(const at::Tensor & self, const at::Scalar & fill_value, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, c10::optional memory_format); +TORCH_API at::Tensor & full_like_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & fill_value, c10::optional memory_format=c10::nullopt); +TORCH_API at::Tensor & full_like_outf(const at::Tensor & self, const at::Scalar & fill_value, c10::optional memory_format, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gcd.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gcd.h new file mode 100644 index 0000000000000000000000000000000000000000..ea16ad5d580e51176c508f6058b4263834b6c044 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gcd.h @@ -0,0 +1,44 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & gcd_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::gcd_out::call(self, other, out); +} +// aten::gcd.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & gcd_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::gcd_out::call(self, other, out); +} + +// aten::gcd(Tensor self, Tensor other) -> Tensor +inline at::Tensor gcd(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::gcd::call(self, other); +} + +// aten::gcd_(Tensor(a!) self, Tensor other) -> Tensor(a!) +inline at::Tensor & gcd_(at::Tensor & self, const at::Tensor & other) { + return at::_ops::gcd_::call(self, other); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gru.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gru.h new file mode 100644 index 0000000000000000000000000000000000000000..3c53bfd7bf09dccf3d6fa7959bd28fed9580aadb --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gru.h @@ -0,0 +1,35 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::gru.input(Tensor input, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor) +inline ::std::tuple gru(const at::Tensor & input, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool 
batch_first) { + return at::_ops::gru_input::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first); +} + +// aten::gru.data(Tensor data, Tensor batch_sizes, Tensor hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional) -> (Tensor, Tensor) +inline ::std::tuple gru(const at::Tensor & data, const at::Tensor & batch_sizes, const at::Tensor & hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) { + return at::_ops::gru_data::call(data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/layer_norm_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/layer_norm_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..27ca501b266713d875588e95f7c3f6875f69eeea --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/layer_norm_compositeimplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional & weight={}, const c10::optional & bias={}, double eps=1e-05, bool cudnn_enable=true); +TORCH_API at::Tensor layer_norm_symint(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional & weight={}, const c10::optional & bias={}, double eps=1e-05, bool cudnn_enable=true); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_ex_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_ex_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..a65e13fc5a6bf4e6fc605adc10c86e280c0b1dc5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_ex_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API linalg_solve_ex { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, bool, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::linalg_solve_ex") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor info)") + static ::std::tuple call(const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors); +}; + +struct TORCH_API linalg_solve_ex_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, bool, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::linalg_solve_ex") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) result, Tensor(b!) 
info)") + static ::std::tuple call(const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors, at::Tensor & result, at::Tensor & info); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors, at::Tensor & result, at::Tensor & info); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h new file mode 100644 index 0000000000000000000000000000000000000000..b1725e41270dc52130b80688516299d4d4c26910 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor +inline at::Tensor masked_scatter(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) { + return at::_ops::masked_scatter::call(self, mask, source); +} + +// aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & masked_scatter_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) { + return at::_ops::masked_scatter_out::call(self, mask, source, out); +} +// aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & masked_scatter_outf(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source, at::Tensor & out) { + return at::_ops::masked_scatter_out::call(self, mask, source, out); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_max_pool3d.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_max_pool3d.h new file mode 100644 index 0000000000000000000000000000000000000000..05f82740347eb7bc620a57be0b1b7a2fd6242946 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_max_pool3d.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::mkldnn_max_pool3d(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False) -> Tensor +inline at::Tensor mkldnn_max_pool3d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::mkldnn_max_pool3d::call(self, kernel_size, stride, padding, dilation, ceil_mode); +} + +// aten::mkldnn_max_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & mkldnn_max_pool3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) { + return at::_ops::mkldnn_max_pool3d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out); +} +// aten::mkldnn_max_pool3d.out(Tensor self, int[3] kernel_size, int[3] stride=[], int[3] padding=0, int[3] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & mkldnn_max_pool3d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) { + return at::_ops::mkldnn_max_pool3d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..be28c123efab9206c637ea22095cb6a9a22a2842 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mvlgamma_cuda_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & mvlgamma_out(at::Tensor & out, const at::Tensor & self, int64_t p); +TORCH_API at::Tensor & mvlgamma_outf(const at::Tensor & self, int64_t p, at::Tensor & out); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_batch_norm_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_batch_norm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..06c902f79187c28e28bdd50f894e4fe4c9a1acf5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_batch_norm_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API native_batch_norm { + using schema = ::std::tuple (const at::Tensor &, const c10::optional &, const c10::optional &, const c10::optional &, const c10::optional &, bool, double, double); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::native_batch_norm") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? 
running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)") + static ::std::tuple call(const at::Tensor & input, const c10::optional & weight, const c10::optional & bias, const c10::optional & running_mean, const c10::optional & running_var, bool training, double momentum, double eps); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional & weight, const c10::optional & bias, const c10::optional & running_mean, const c10::optional & running_var, bool training, double momentum, double eps); +}; + +struct TORCH_API native_batch_norm_out { + using schema = ::std::tuple (const at::Tensor &, const c10::optional &, const c10::optional &, const c10::optional &, const c10::optional &, bool, double, double, at::Tensor &, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::native_batch_norm") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "native_batch_norm.out(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, *, Tensor(a!) out, Tensor(b!) save_mean, Tensor(c!) 
save_invstd) -> (Tensor(a!), Tensor(b!), Tensor(c!))") + static ::std::tuple call(const at::Tensor & input, const c10::optional & weight, const c10::optional & bias, const c10::optional & running_mean, const c10::optional & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const c10::optional & weight, const c10::optional & bias, const c10::optional & running_mean, const c10::optional & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..06499cf517c33e7b8c9b92a1d5588899bb4315e6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_cpu_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple native_layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional & weight, const c10::optional & bias, double eps); +TORCH_API ::std::tuple native_layer_norm_symint(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional & weight, const c10::optional & bias, double eps); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_norm.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..8c6fd8aa8ee1871406e7f1f9efca65aa6b574c9e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_norm.h @@ -0,0 +1,53 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::native_norm(Tensor self, Scalar p=2) -> Tensor +inline at::Tensor native_norm(const at::Tensor & self, const at::Scalar & p=2) { + return at::_ops::native_norm::call(self, p); +} + +// aten::native_norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype) -> Tensor +inline at::Tensor native_norm(const at::Tensor & self, const c10::optional & p, at::IntArrayRef dim, bool keepdim, c10::optional dtype) { + return at::_ops::native_norm_ScalarOpt_dim_dtype::call(self, p, dim, keepdim, dtype); +} + +// aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & native_norm_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & p=2) { + return at::_ops::native_norm_out::call(self, p, out); +} +// aten::native_norm.out(Tensor self, Scalar p=2, *, Tensor(a!) 
out) -> Tensor(a!) +inline at::Tensor & native_norm_outf(const at::Tensor & self, const at::Scalar & p, at::Tensor & out) { + return at::_ops::native_norm_out::call(self, p, out); +} + +// aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & native_norm_out(at::Tensor & out, const at::Tensor & self, const c10::optional & p, at::IntArrayRef dim, bool keepdim, c10::optional dtype) { + return at::_ops::native_norm_ScalarOpt_dim_dtype_out::call(self, p, dim, keepdim, dtype, out); +} +// aten::native_norm.ScalarOpt_dim_dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, ScalarType? dtype, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & native_norm_outf(const at::Tensor & self, const c10::optional & p, at::IntArrayRef dim, bool keepdim, c10::optional dtype, at::Tensor & out) { + return at::_ops::native_norm_ScalarOpt_dim_dtype_out::call(self, p, dim, keepdim, dtype, out); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3579d56299e902872ad9f8eb9faefd56a963dfce --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h @@ -0,0 +1,28 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple nll_loss_forward(const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, int64_t ignore_index); +TORCH_API ::std::tuple nll_loss_forward_symint(const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, c10::SymInt ignore_index); +TORCH_API ::std::tuple nll_loss_forward_out(at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, int64_t ignore_index); +TORCH_API ::std::tuple nll_loss_forward_outf(const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, int64_t ignore_index, at::Tensor & output, at::Tensor & total_weight); +TORCH_API ::std::tuple nll_loss_forward_symint_out(at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, c10::SymInt ignore_index); +TORCH_API ::std::tuple nll_loss_forward_symint_outf(const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nuclear_norm_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nuclear_norm_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..ef594d0dbbe0ce77d09390f9ebc5a173a1c8f87f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nuclear_norm_ops.h @@ -0,0 +1,61 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any 
types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API nuclear_norm { + using schema = at::Tensor (const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::nuclear_norm") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "nuclear_norm(Tensor self, bool keepdim=False) -> Tensor") + static at::Tensor call(const at::Tensor & self, bool keepdim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool keepdim); +}; + +struct TORCH_API nuclear_norm_out { + using schema = at::Tensor & (const at::Tensor &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::nuclear_norm") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "nuclear_norm.out(Tensor self, bool keepdim=False, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, bool keepdim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, bool keepdim, at::Tensor & out); +}; + +struct TORCH_API nuclear_norm_dim { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::nuclear_norm") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dim") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "nuclear_norm.dim(Tensor self, int[2] dim, bool keepdim=False) -> Tensor") + static at::Tensor call(const at::Tensor & self, at::IntArrayRef dim, bool keepdim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim); +}; + +struct TORCH_API nuclear_norm_dim_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::nuclear_norm") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dim_out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/put_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/put_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..42c0e610cfbbe4173e5a08e03034ccc05a0b0713 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/put_cuda_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor & put_(at::Tensor & self, const at::Tensor & index, const at::Tensor & source, bool accumulate=false); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad2d.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad2d.h new file mode 100644 index 0000000000000000000000000000000000000000..efda45d7d74fc3c17ef35d567c92e170705528f6 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad2d.h @@ -0,0 +1,91 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & reflection_pad2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad2d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template ::value>> + at::Tensor & reflection_pad2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad2d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & reflection_pad2d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::reflection_pad2d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template ::value>> + at::Tensor & reflection_pad2d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::reflection_pad2d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & reflection_pad2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad2d_out::call(self, padding, out); +} +namespace symint { + template ::value>> + at::Tensor & reflection_pad2d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad2d_out::call(self, padding, out); + } +} + +// aten::reflection_pad2d.out(Tensor self, SymInt[4] padding, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & reflection_pad2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::reflection_pad2d_out::call(self, padding, out); +} +namespace symint { + template ::value>> + at::Tensor & reflection_pad2d_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::reflection_pad2d_out::call(self, padding, out); + } +} + +// aten::reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor +inline at::Tensor reflection_pad2d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad2d::call(self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template ::value>> + at::Tensor reflection_pad2d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::reflection_pad2d::call(self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::reflection_pad2d(Tensor self, SymInt[4] padding) -> Tensor +inline at::Tensor reflection_pad2d_symint(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad2d::call(self, padding); +} +namespace symint { + template ::value>> + at::Tensor reflection_pad2d(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::reflection_pad2d::call(self, padding); + } +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h new file mode 100644 index 0000000000000000000000000000000000000000..fdeefab11dab80f2f4dfe7143180d85311a1209a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h @@ -0,0 +1,91 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + 
+namespace at { + + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template ::value>> + at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template ::value>> + at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, padding, out); +} +namespace symint { + template ::value>> + at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, padding, out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & replication_pad1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, padding, out); +} +namespace symint { + template ::value>> + at::Tensor & replication_pad1d_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, padding, out); + } +} + +// aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template ::value>> + at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor replication_pad1d_symint(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, padding); +} +namespace symint { + template ::value>> + at::Tensor replication_pad1d(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, padding); + } +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_backward_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_backward_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..2b40e647bb0e21a2d56a120d10b8427dfc2100a4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_backward_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// 
@generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor replication_pad1d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding); +TORCH_API at::Tensor replication_pad1d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..404b5a485c15dcd972c9271caf6e4e0871f4d15e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_slow_conv_transpose2d : public at::impl::MetaBase { + + + void meta(const at::Tensor & self, const at::Tensor & weight, at::ArrayRef kernel_size, at::OptionalTensorRef bias, at::ArrayRef stride, at::ArrayRef padding, at::ArrayRef output_padding, at::ArrayRef dilation); +}; + +} // namespace native +} // namespace 
at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose3d_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose3d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..268026e3646dac3b2269b67e02cb1a0ccd9d4652 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_transpose3d_cpu_dispatch.h @@ -0,0 +1,28 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor slow_conv_transpose3d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1); +TORCH_API at::Tensor slow_conv_transpose3d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)); +TORCH_API at::Tensor & slow_conv_transpose3d_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1); +TORCH_API at::Tensor & slow_conv_transpose3d_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef output_padding, at::IntArrayRef dilation, at::Tensor & out); +TORCH_API at::Tensor & slow_conv_transpose3d_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1)); +TORCH_API at::Tensor & slow_conv_transpose3d_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef output_padding, c10::SymIntArrayRef dilation, at::Tensor & out); + +} // namespace cpu +} // namespace at diff --git 
a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ad4380868908401b3764570aefeb27d1bf7e8cee --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_meta_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor softplus(const at::Tensor & self, const at::Scalar & beta=1, const at::Scalar & threshold=20); +TORCH_API at::Tensor & softplus_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & beta=1, const at::Scalar & threshold=20); +TORCH_API at::Tensor & softplus_outf(const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & out); + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_he_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_he_native.h new file mode 100644 index 0000000000000000000000000000000000000000..476663801cc3b3e1ffbf6c462394c7c385aba126 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_he_native.h @@ 
-0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +struct TORCH_API structured_special_hermite_polynomial_he_out : public at::meta::structured_special_hermite_polynomial_he { +void impl(const at::Tensor & x, const at::Tensor & n, const at::Tensor & out); +}; +TORCH_API at::Tensor special_hermite_polynomial_he(const at::Scalar & x, const at::Tensor & n); +TORCH_API at::Tensor & special_hermite_polynomial_he_out(const at::Scalar & x, const at::Tensor & n, at::Tensor & out); +TORCH_API at::Tensor special_hermite_polynomial_he(const at::Tensor & x, const at::Scalar & n); +TORCH_API at::Tensor & special_hermite_polynomial_he_out(const at::Tensor & x, const at::Scalar & n, at::Tensor & out); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_modified_bessel_i0.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_modified_bessel_i0.h new file mode 100644 index 0000000000000000000000000000000000000000..a3350c223596f94fc6fff6126fc30514c3bf535f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_modified_bessel_i0.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::special_modified_bessel_i0(Tensor self) -> Tensor +inline at::Tensor special_modified_bessel_i0(const at::Tensor & self) { + return at::_ops::special_modified_bessel_i0::call(self); +} + +// aten::special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & special_modified_bessel_i0_out(at::Tensor & out, const at::Tensor & self) { + return at::_ops::special_modified_bessel_i0_out::call(self, out); +} +// aten::special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & special_modified_bessel_i0_outf(const at::Tensor & self, at::Tensor & out) { + return at::_ops::special_modified_bessel_i0_out::call(self, out); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/values_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/values_native.h new file mode 100644 index 0000000000000000000000000000000000000000..ded0053bfa9bb88947efd0cb3ecc7c4e75de0271 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/values_native.h @@ -0,0 +1,24 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor values_default(const at::Tensor & self); +TORCH_API at::Tensor values_nested(const at::Tensor & self); +TORCH_API at::Tensor values_sparse(const at::Tensor & self); +TORCH_API at::Tensor values_sparse_csr(const at::Tensor & self); +} // namespace native +} // namespace at