Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CopyKernel.h +12 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Reduce.h +314 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h +144 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/StackKernel.h +12 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/int_mm_kernel.h +16 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/MPSGraphVenturaOps.h +197 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorFactories.h +7 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h +81 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h +44 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h +415 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h +24 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cslt_compress_cuda_dispatch.h +23 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fft_c2r.h +91 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcdiv_compositeexplicitautograd_dispatch.h +28 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_cosh_ops.h +50 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_erf.h +44 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_minimum.h +82 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_trunc.h +44 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fw_primal.h +26 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_int_mm_cuda_dispatch.h +25 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h +24 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_view_from_buffer_cpu_dispatch.h +23 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_remove_batch_dim.h +30 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_segment_reduce_backward_cpu_dispatch.h +23 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_bsc_tensor_unsafe_native.h +21 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h +26 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_differentiable_lstm_cell_backward_native.h +21 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h +39 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_cuda_dispatch.h +24 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward.h +91 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_ops.h +105 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arctan2_ops.h +50 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/asin_compositeexplicitautogradnonfunctional_dispatch.h +24 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_backward_native.h +28 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_xor_meta.h +27 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_meta.h +27 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clamp_ops.h +83 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward.h +91 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward_cuda_dispatch.h +24 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_add_relu.h +91 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_add_relu_native.h +22 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_native.h +22 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/eye_compositeexplicitautograd_dispatch.h +30 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16.h +30 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfftn_ops.h +39 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fractional_max_pool3d_cpu_dispatch.h +25 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/glu_jvp_ops.h +39 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h +23 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardswish_ops.h +50 -0
- tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_put_compositeexplicitautograd_dispatch.h +26 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CopyKernel.h
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
namespace at {
|
| 4 |
+
struct TensorIteratorBase;
|
| 5 |
+
|
| 6 |
+
namespace native {
|
| 7 |
+
inline namespace CPU_CAPABILITY {
|
| 8 |
+
|
| 9 |
+
void direct_copy_kernel(TensorIteratorBase &iter);
|
| 10 |
+
void copy_kernel(TensorIterator& iter, bool /*non_blocking*/);
|
| 11 |
+
|
| 12 |
+
}}} // namespace at::native::CPU_CAPABILITY
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Reduce.h
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <ATen/native/cpu/Loops.h>
|
| 4 |
+
#include <ATen/Parallel.h>
|
| 5 |
+
#include <c10/util/TypeList.h>
|
| 6 |
+
#include <c10/core/Scalar.h>
|
| 7 |
+
#include <c10/util/irange.h>
|
| 8 |
+
|
| 9 |
+
#include <sstream>
|
| 10 |
+
#include <type_traits>
|
| 11 |
+
|
| 12 |
+
namespace at { namespace native { inline namespace CPU_CAPABILITY {
|
| 13 |
+
|
| 14 |
+
using namespace vec;
|
| 15 |
+
|
| 16 |
+
#define VEC_LOOP_HEADER(func_t, data) \
|
| 17 |
+
using scalar_t = typename function_traits<func_t>::result_type; \
|
| 18 |
+
using Vec = Vectorized<scalar_t>; \
|
| 19 |
+
char* out_ptr = data[0]; \
|
| 20 |
+
(void) out_ptr;
|
| 21 |
+
|
| 22 |
+
// reduction that is contiguous over the input in dim 0
|
| 23 |
+
template <typename traits>
|
| 24 |
+
static inline bool is_contiguous_reduction(const int64_t* strides) {
|
| 25 |
+
return strides[0] == 0 &&
|
| 26 |
+
strides[1] == sizeof(typename traits::arg2_t);
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
// reduction that is contiguous over the input in dim 1
|
| 30 |
+
template <typename traits>
|
| 31 |
+
static inline bool is_outer_reduction(const int64_t* strides) {
|
| 32 |
+
return strides[0] == 0 &&
|
| 33 |
+
strides[2] == sizeof(typename traits::result_type) &&
|
| 34 |
+
strides[3] == sizeof(typename traits::arg2_t);
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
template <typename func_t, typename vec_func_t>
|
| 38 |
+
static inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
|
| 39 |
+
func_t op, vec_func_t vop, bool reduce) {
|
| 40 |
+
VEC_LOOP_HEADER(func_t, data)
|
| 41 |
+
const char* in1_ptr = data[1];
|
| 42 |
+
Vec acc[4];
|
| 43 |
+
for (const auto j : c10::irange(4)) {
|
| 44 |
+
acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t));
|
| 45 |
+
}
|
| 46 |
+
for (const auto i : c10::irange(1, n)) {
|
| 47 |
+
const char* ptr = in1_ptr + stride * i;
|
| 48 |
+
acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
|
| 49 |
+
acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
|
| 50 |
+
acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
|
| 51 |
+
acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
|
| 52 |
+
}
|
| 53 |
+
if (reduce) {
|
| 54 |
+
scalar_t buffer[Vec::size()];
|
| 55 |
+
acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
|
| 56 |
+
acc[0].store(buffer);
|
| 57 |
+
for (const auto j : c10::irange(1, Vec::size())) {
|
| 58 |
+
buffer[0] = op(buffer[0], buffer[j]);
|
| 59 |
+
}
|
| 60 |
+
auto dst = (scalar_t*)out_ptr;
|
| 61 |
+
*dst = op(*dst, buffer[0]);
|
| 62 |
+
} else {
|
| 63 |
+
for (const auto j : c10::irange(4)) {
|
| 64 |
+
auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
|
| 65 |
+
acc[j] = vop(acc[j], Vec::loadu(dst));
|
| 66 |
+
acc[j].store(dst);
|
| 67 |
+
}
|
| 68 |
+
}
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
template <typename F>
|
| 72 |
+
static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) {
|
| 73 |
+
for (const auto j C10_UNUSED : c10::irange(n)) {
|
| 74 |
+
f();
|
| 75 |
+
data[0] += strides[0];
|
| 76 |
+
data[1] += strides[1];
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// computes the reduction out = op(out, in)
|
| 81 |
+
template <typename func_t, typename vec_func_t>
|
| 82 |
+
static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
|
| 83 |
+
VEC_LOOP_HEADER(func_t, data)
|
| 84 |
+
int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
|
| 85 |
+
int64_t count = n / (4 * Vec::size());
|
| 86 |
+
if (count > 0) {
|
| 87 |
+
vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true);
|
| 88 |
+
}
|
| 89 |
+
char* ptrs[3] = { data[0], data[0], data[1] };
|
| 90 |
+
int64_t strides[] = { 0, 0, sizeof(scalar_t) };
|
| 91 |
+
basic_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
|
| 92 |
+
}
|
| 93 |
+
|
| 94 |
+
// computes the reduction out = op(out, in)
|
| 95 |
+
template <typename func_t, typename vec_func_t>
|
| 96 |
+
static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
|
| 97 |
+
VEC_LOOP_HEADER(func_t, data)
|
| 98 |
+
|
| 99 |
+
// reduce down each column of 4 * Vec::size() elements (128 or 256 bytes)
|
| 100 |
+
#if defined(CPU_CAPABILITY_AVX512)
|
| 101 |
+
int64_t outer_stride[2] = { 256, 256 };
|
| 102 |
+
#else
|
| 103 |
+
int64_t outer_stride[2] = { 128, 128 };
|
| 104 |
+
#endif
|
| 105 |
+
UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
|
| 106 |
+
vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false);
|
| 107 |
+
});
|
| 108 |
+
|
| 109 |
+
// reduce down the remaining columns
|
| 110 |
+
int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
|
| 111 |
+
int64_t remaining = size1 % (4 * Vec::size());
|
| 112 |
+
UNARY_OUTER_LOOP(data, step, remaining, [&] {
|
| 113 |
+
char* ptrs[3] = { data[0], data[0], data[1] };
|
| 114 |
+
int64_t strides[] = { 0, 0, inner_stride };
|
| 115 |
+
basic_loop(ptrs, strides, 0, size0, op);
|
| 116 |
+
});
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
template<typename traits, typename res_t>
|
| 120 |
+
static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
|
| 121 |
+
// static_assert(std::is_same<res_t, typename traits::arg2_t>::value, "data types must match");
|
| 122 |
+
if (index < num_outputs) {
|
| 123 |
+
char *out = (char *) iter.data_ptr(index);
|
| 124 |
+
*(res_t *) out = result;
|
| 125 |
+
}
|
| 126 |
+
}
|
| 127 |
+
|
| 128 |
+
template<typename traits, typename res_t>
|
| 129 |
+
static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
|
| 130 |
+
AT_ASSERT(num_outputs == 1);
|
| 131 |
+
set_result<traits>(0, result, iter, num_outputs);
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
template<typename traits, std::size_t i = 0, typename... tuple_t>
|
| 135 |
+
static inline typename std::enable_if<i == sizeof...(tuple_t), std::size_t>::type
|
| 136 |
+
for_each_in_tuple(const std::tuple<tuple_t...>& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) {
|
| 137 |
+
return i;
|
| 138 |
+
}
|
| 139 |
+
|
| 140 |
+
template<typename traits, std::size_t i = 0, typename... tuple_t>
|
| 141 |
+
static inline typename std::enable_if<i < sizeof...(tuple_t), std::size_t>::type
|
| 142 |
+
for_each_in_tuple(const std::tuple<tuple_t...>& t, const TensorIteratorBase &iter, const int num_outputs) {
|
| 143 |
+
if (i < (size_t)num_outputs) {
|
| 144 |
+
set_result<traits>(i, std::get<i>(t), iter, num_outputs);
|
| 145 |
+
return for_each_in_tuple<traits, i + 1, tuple_t...>(t, iter, num_outputs);
|
| 146 |
+
}
|
| 147 |
+
return i;
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
template<typename traits, typename... res_t>
|
| 151 |
+
static void set_results(const std::tuple<res_t...>& result, const TensorIteratorBase &iter, const int num_outputs) {
|
| 152 |
+
AT_ASSERT(num_outputs >= 1);
|
| 153 |
+
std::size_t result_size = for_each_in_tuple<traits>(result, iter, num_outputs);
|
| 154 |
+
AT_ASSERT((size_t)num_outputs == result_size);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
template <typename T, typename... Args>
|
| 158 |
+
struct all_same : std::conjunction<
|
| 159 |
+
std::is_same<T, Args>...
|
| 160 |
+
> {};
|
| 161 |
+
|
| 162 |
+
// data_t is the input/output data type.
|
| 163 |
+
// acc_t is a type that contains all the necessary data
|
| 164 |
+
// to continue reducing.
|
| 165 |
+
// index_t is a one-dimensional index
|
| 166 |
+
//
|
| 167 |
+
// ops_t is such that &ops_t::reduce, &ops_t::combine, and &ops_t::project exist and satisfy
|
| 168 |
+
// the following.
|
| 169 |
+
// reduce: (acc_t, data_t, index_t) -> acc_t adds one data point to the accumulated value.
|
| 170 |
+
// combine: (acc_t, acc_t) -> acc_t combines two accumulated values into one.
|
| 171 |
+
// project: acc_t -> out_t finishes the reduction, getting the required output.
|
| 172 |
+
//
|
| 173 |
+
// Additionally, acc_t must be default-constructible:
|
| 174 |
+
// acc_t {} is an identity for combine,
|
| 175 |
+
// and project(acc_t {}) is the value of the operation on zero elements.
|
| 176 |
+
//
|
| 177 |
+
// The point of `combine` is to support parallelization -
|
| 178 |
+
// the idea is to one sequence of `reduce` calls per thread of execution,
|
| 179 |
+
// and then to combine them at the end with `combine`.
|
| 180 |
+
//
|
| 181 |
+
// If there is more than one output element,
|
| 182 |
+
// our parallelization strategy is to use one thread for each of them,
|
| 183 |
+
// which means that `combine` will never be called.
|
| 184 |
+
//
|
| 185 |
+
// If, on the other hand, there is only one, then we split the input into
|
| 186 |
+
// into several pieces, reduce each separately, and then combine them.
|
| 187 |
+
|
| 188 |
+
template <typename ops_t, typename init_t>
|
| 189 |
+
void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
|
| 190 |
+
using rf_t = decltype(&ops_t::reduce);
|
| 191 |
+
using cf_t = decltype(&ops_t::combine);
|
| 192 |
+
using pf_t = decltype(&ops_t::project);
|
| 193 |
+
using r_traits = binary_function_traits<rf_t>;
|
| 194 |
+
using c_traits = binary_function_traits<cf_t>;
|
| 195 |
+
using p_traits = unary_function_traits<pf_t>;
|
| 196 |
+
using acc_t = typename p_traits::arg1_t;
|
| 197 |
+
using data_t = typename r_traits::arg2_t;
|
| 198 |
+
static_assert(
|
| 199 |
+
all_same<
|
| 200 |
+
acc_t,
|
| 201 |
+
init_t,
|
| 202 |
+
typename r_traits::arg1_t,
|
| 203 |
+
typename r_traits::result_type,
|
| 204 |
+
typename c_traits::arg1_t,
|
| 205 |
+
typename c_traits::arg2_t,
|
| 206 |
+
typename c_traits::result_type>::value,
|
| 207 |
+
"all accumulate types must match");
|
| 208 |
+
static_assert(
|
| 209 |
+
std::is_default_constructible<acc_t>::value,
|
| 210 |
+
"the accumulate type must be default-constructible"
|
| 211 |
+
);
|
| 212 |
+
const int num_outputs = iter.noutputs();
|
| 213 |
+
iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) {
|
| 214 |
+
auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t {
|
| 215 |
+
int ntensors = sub_iter.ntensors();
|
| 216 |
+
sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) {
|
| 217 |
+
AT_ASSERT(ntensors - num_outputs == 1);
|
| 218 |
+
char *in = data[ntensors - 1];
|
| 219 |
+
int64_t stride = strides[ntensors - 1];
|
| 220 |
+
for (const auto i : c10::irange(size)) {
|
| 221 |
+
acc = ops.reduce(acc, c10::load<data_t>(in), begin + i);
|
| 222 |
+
in += stride;
|
| 223 |
+
}
|
| 224 |
+
}, {begin, end});
|
| 225 |
+
return ops.translate_idx(acc, sub_iter.view_offsets()[0]);
|
| 226 |
+
};
|
| 227 |
+
acc_t total_acc = init;
|
| 228 |
+
auto numel = sub_iter.numel();
|
| 229 |
+
if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
|
| 230 |
+
at::in_parallel_region()) {
|
| 231 |
+
total_acc = reduction_body(total_acc, 0, numel);
|
| 232 |
+
} else {
|
| 233 |
+
int max_threads = at::get_num_threads();
|
| 234 |
+
AT_ASSERT(max_threads > 0);
|
| 235 |
+
static_assert(
|
| 236 |
+
!std::is_same<acc_t, bool>::value,
|
| 237 |
+
"Concurrently modifying different references into std::vector<bool> is UB."
|
| 238 |
+
);
|
| 239 |
+
std::vector<acc_t> buffer((unsigned)max_threads, init);
|
| 240 |
+
at::parallel_for(0, numel, internal::GRAIN_SIZE,
|
| 241 |
+
[&](int64_t begin, int64_t end) {
|
| 242 |
+
auto& acc = buffer[at::get_thread_num()];
|
| 243 |
+
acc = reduction_body(acc, begin, end);
|
| 244 |
+
}
|
| 245 |
+
);
|
| 246 |
+
for (const auto i : c10::irange(max_threads)) {
|
| 247 |
+
total_acc = ops.combine(total_acc, buffer[i]);
|
| 248 |
+
}
|
| 249 |
+
}
|
| 250 |
+
set_results<r_traits>(ops.project(total_acc), sub_iter, num_outputs);
|
| 251 |
+
});
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
template <typename func_t, typename vec_func_t>
|
| 255 |
+
void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
|
| 256 |
+
using traits = binary_function_traits<func_t>;
|
| 257 |
+
static_assert(
|
| 258 |
+
all_same<
|
| 259 |
+
typename traits::result_type,
|
| 260 |
+
typename traits::arg1_t,
|
| 261 |
+
typename traits::arg2_t>::value,
|
| 262 |
+
"all types must match");
|
| 263 |
+
|
| 264 |
+
iter.output_base().fill_(ident);
|
| 265 |
+
iter.parallel_reduce([&](char** data, const int64_t* strides, int64_t size0, int64_t size1) {
|
| 266 |
+
int64_t outer_strides[] = { strides[2], strides[3] };
|
| 267 |
+
if (is_contiguous_reduction<traits>(strides)) {
|
| 268 |
+
// input is contiguous in dim 0, output is reduced in dim 0
|
| 269 |
+
UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
|
| 270 |
+
vectorized_inner_reduction(data, size0, op, vop);
|
| 271 |
+
});
|
| 272 |
+
} else if (is_outer_reduction<traits>(strides)) {
|
| 273 |
+
// input and output are contiguous in dim 1
|
| 274 |
+
int64_t inner_stride = strides[1]; // stride of input in dim 0
|
| 275 |
+
vectorized_outer_reduction(data, inner_stride, size0, size1, op, vop);
|
| 276 |
+
} else {
|
| 277 |
+
UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
|
| 278 |
+
char* ptrs[3] = { data[0], data[0], data[1] };
|
| 279 |
+
int64_t inner_strides[3] = { strides[0], strides[0], strides[1] };
|
| 280 |
+
basic_loop(ptrs, inner_strides, 0, size0, op);
|
| 281 |
+
});
|
| 282 |
+
}
|
| 283 |
+
});
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
// when reduction is on most inner dimension (dim 0 in TensorIterator)
|
| 287 |
+
// and input has contiguous most inner dimension, `binary_kernel_reduce_lastdim`
|
| 288 |
+
// can be used.
|
| 289 |
+
static inline bool is_reduce_lastdim(TensorIteratorBase& iter) {
|
| 290 |
+
return iter.num_reduce_dims() == 1 && iter.is_dim_reduced(0)
|
| 291 |
+
&& iter.ninputs() == 1 && iter.strides(1)[0] == iter.element_size(1);
|
| 292 |
+
}
|
| 293 |
+
|
| 294 |
+
template <typename reduce_func_t>
|
| 295 |
+
void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce_op) {
|
| 296 |
+
auto shape = iter.shape();
|
| 297 |
+
int64_t dim_size = shape[0];
|
| 298 |
+
int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / dim_size);
|
| 299 |
+
TensorIterator sub_iter(iter);
|
| 300 |
+
// create sub iterator to parallel on all non-reduce-dims
|
| 301 |
+
sub_iter.narrow(0, 0, 1);
|
| 302 |
+
auto loop = [&](char** data, const int64_t* strides, int64_t size) {
|
| 303 |
+
char* out = data[0];
|
| 304 |
+
char* in = data[1];
|
| 305 |
+
for (int64_t i = 0; i < size; ++i) {
|
| 306 |
+
reduce_op(out, in, dim_size);
|
| 307 |
+
out += strides[0];
|
| 308 |
+
in += strides[1];
|
| 309 |
+
}
|
| 310 |
+
};
|
| 311 |
+
sub_iter.for_each(loop, grain_size);
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
}}} // namespace at::native::<anonymous>
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Copyright 2004-present Facebook. All Rights Reserved.
|
| 2 |
+
#pragma once
|
| 3 |
+
|
| 4 |
+
#include <ATen/core/Tensor.h>
|
| 5 |
+
|
| 6 |
+
#include <ATen/MemoryOverlap.h>
|
| 7 |
+
#include <ATen/Parallel.h>
|
| 8 |
+
#include <ATen/TensorIterator.h>
|
| 9 |
+
#include <ATen/cpu/vec/functional.h>
|
| 10 |
+
#include <ATen/cpu/vec/vec.h>
|
| 11 |
+
#include <c10/util/irange.h>
|
| 12 |
+
|
| 13 |
+
namespace at { namespace native { namespace detail {
|
| 14 |
+
|
| 15 |
+
struct InputMeta {
|
| 16 |
+
void* data_ptr;
|
| 17 |
+
int64_t inner_size;
|
| 18 |
+
|
| 19 |
+
InputMeta(const Tensor& t, int64_t dim, int64_t inner)
|
| 20 |
+
: data_ptr(t.data_ptr()), inner_size(t.sizes()[dim] * inner) {}
|
| 21 |
+
};
|
| 22 |
+
|
| 23 |
+
// This kernel is used by two TensorList types:
|
| 24 |
+
// 1. stack_serial_kernel uses at::ArrayRef<Tensor>
|
| 25 |
+
// 2. Static runtime calls this kernel directly (csrc/jit/runtime/static/ops.cpp) with
|
| 26 |
+
// ProcessedNodeInputWrapper.
|
| 27 |
+
// When making changes, make sure that they are compatible with both types!
|
| 28 |
+
template <typename scalar_t, typename TensorListType>
|
| 29 |
+
void stack_serial_kernel_impl(Tensor& result, TensorListType tensors, int64_t dim) {
|
| 30 |
+
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
| 31 |
+
dim >= 0 && dim <= result.dim(),
|
| 32 |
+
"dim out of range in stack_serial_kernel_impl");
|
| 33 |
+
int64_t outer =
|
| 34 |
+
result.numel() / (result.sizes()[dim] * result.strides()[dim]);
|
| 35 |
+
scalar_t* result_data = result.data_ptr<scalar_t>();
|
| 36 |
+
int64_t ninputs = tensors.size();
|
| 37 |
+
std::vector<InputMeta> inputs;
|
| 38 |
+
inputs.reserve(ninputs);
|
| 39 |
+
for (const auto& tensor : tensors) {
|
| 40 |
+
inputs.emplace_back(tensor, dim, tensor.strides()[dim]);
|
| 41 |
+
}
|
| 42 |
+
|
| 43 |
+
using Vec = vec::Vectorized<scalar_t>;
|
| 44 |
+
scalar_t* result_ptr = result_data;
|
| 45 |
+
for (const auto i : c10::irange(outer)) {
|
| 46 |
+
for (const auto j : c10::irange(ninputs)) {
|
| 47 |
+
int64_t local_inner = inputs[j].inner_size;
|
| 48 |
+
scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner;
|
| 49 |
+
|
| 50 |
+
if (local_inner < Vec::size()) {
|
| 51 |
+
for (const auto k : c10::irange(local_inner)) {
|
| 52 |
+
result_ptr[k] = input_ptr[k];
|
| 53 |
+
}
|
| 54 |
+
} else {
|
| 55 |
+
vec::map(
|
| 56 |
+
[](Vec x) { return x; }, result_ptr, input_ptr, local_inner);
|
| 57 |
+
}
|
| 58 |
+
result_ptr += local_inner;
|
| 59 |
+
}
|
| 60 |
+
}
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
// Checks to see whether native stack can be invoked under these conditions:
|
| 64 |
+
// - result and input tensors are contiguous
|
| 65 |
+
// - only one thread is used
|
| 66 |
+
// - no type promotion has to occur
|
| 67 |
+
// - tensors dtype is Double or Float
|
| 68 |
+
template <typename TensorListType>
|
| 69 |
+
bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, int64_t dim) {
|
| 70 |
+
TORCH_CHECK(tensors.size() > 0, "expected a non-empty list of Tensors");
|
| 71 |
+
const Tensor& first_tensor = tensors[0];
|
| 72 |
+
// stack dimension should be in range [0,firstTensor.dim())
|
| 73 |
+
// dim == firstTensor.dim() is a valid input, but it is handled by default code path
|
| 74 |
+
// that uses unsqueeze
|
| 75 |
+
if (dim >= first_tensor.dim()) return false;
|
| 76 |
+
// Native stack doesn't apply any tensor is skipped.
|
| 77 |
+
if (first_tensor.numel() == 0 && first_tensor.dim() == 1) return false;
|
| 78 |
+
// there should be no type promotion
|
| 79 |
+
if (result.dtype() != first_tensor.dtype()) return false;
|
| 80 |
+
|
| 81 |
+
auto first_tensor_mem_format = first_tensor.suggest_memory_format();
|
| 82 |
+
ScalarType dtype = first_tensor.scalar_type();
|
| 83 |
+
|
| 84 |
+
if (!result.is_contiguous(first_tensor_mem_format)) {
|
| 85 |
+
return false;
|
| 86 |
+
}
|
| 87 |
+
|
| 88 |
+
// fast path only works for Double and Float
|
| 89 |
+
if (dtype != ScalarType::Double && dtype != ScalarType::Float) {
|
| 90 |
+
return false;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
// check remainder of inputs
|
| 94 |
+
auto const &first_tensor_shape = first_tensor.sizes();
|
| 95 |
+
for (const auto i : c10::irange(1, tensors.size())) {
|
| 96 |
+
auto const &tensor = tensors[i];
|
| 97 |
+
TORCH_CHECK(tensors[i].sizes() == first_tensor.sizes(),
|
| 98 |
+
"stack expects each tensor to be equal size, but got ", first_tensor_shape,
|
| 99 |
+
" at entry 0 and ", tensor.sizes(), " at entry ", i);
|
| 100 |
+
|
| 101 |
+
// every tensor must be contiguous
|
| 102 |
+
// tensor sizes and strides must be the same
|
| 103 |
+
// there should be no type promotion
|
| 104 |
+
if (!tensor.is_contiguous(first_tensor_mem_format) ||
|
| 105 |
+
tensor.strides() != first_tensor.strides() ||
|
| 106 |
+
tensor.dtype() != dtype) {
|
| 107 |
+
return false;
|
| 108 |
+
}
|
| 109 |
+
}
|
| 110 |
+
|
| 111 |
+
// fast native stack should only be used when it is not worth using multiple threads
|
| 112 |
+
// or there is only one thread. Note that we aren't checking result.numel() here because
|
| 113 |
+
// it may not have been resized and we want to defer that cost till later.
|
| 114 |
+
int64_t numel_in_stack = first_tensor.numel() * tensors.size();
|
| 115 |
+
return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1;
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
template <typename TensorListType, bool should_skip_overlap_check>
|
| 119 |
+
struct CanUseNativeSerialStack;
|
| 120 |
+
|
| 121 |
+
template <typename TensorListType>
|
| 122 |
+
struct CanUseNativeSerialStack<TensorListType, false> {
|
| 123 |
+
static bool call(Tensor& result, TensorListType tensors, int64_t dim) {
|
| 124 |
+
// Inputs cannot alias the output tensor
|
| 125 |
+
for (const auto i : c10::irange(tensors.size())) {
|
| 126 |
+
auto lap = at::get_overlap_status(result, tensors[i]);
|
| 127 |
+
TORCH_CHECK(lap != at::MemOverlapStatus::Partial &&
|
| 128 |
+
lap != at::MemOverlapStatus::Full, 0,
|
| 129 |
+
"unsupported operation: the input tensors cannot refer to any of the "
|
| 130 |
+
"output memory locations. Found overlap in input tensor ", i);
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
return can_use_native_serial_stack_impl(result, tensors, dim);
|
| 134 |
+
}
|
| 135 |
+
};
|
| 136 |
+
|
| 137 |
+
template <typename TensorListType>
|
| 138 |
+
struct CanUseNativeSerialStack<TensorListType, true> {
|
| 139 |
+
static bool call(Tensor& result, TensorListType tensors, int64_t dim) {
|
| 140 |
+
return can_use_native_serial_stack_impl(result, tensors, dim);
|
| 141 |
+
}
|
| 142 |
+
};
|
| 143 |
+
|
| 144 |
+
}}} // namespace at::native::detail
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/StackKernel.h
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Copyright 2004-present Facebook. All Rights Reserved.
|
| 2 |
+
#pragma once
|
| 3 |
+
|
| 4 |
+
#include <ATen/core/Tensor.h>
|
| 5 |
+
#include <ATen/native/DispatchStub.h>
|
| 6 |
+
|
| 7 |
+
namespace at { namespace native {
|
| 8 |
+
|
| 9 |
+
using stack_serial_fn = void(*)(Tensor &, TensorList, int64_t);
|
| 10 |
+
DECLARE_DISPATCH(stack_serial_fn, stack_serial_stub);
|
| 11 |
+
|
| 12 |
+
}} // namespace at::native
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/int_mm_kernel.h
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <ATen/core/Tensor.h>
|
| 4 |
+
#include <ATen/native/DispatchStub.h>
|
| 5 |
+
|
| 6 |
+
namespace at::native {
|
| 7 |
+
|
| 8 |
+
using weight_to_int4pack_fn = void(*)(const Tensor&, const Tensor&, int, int);
|
| 9 |
+
using int4pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int, const Tensor&, int, int);
|
| 10 |
+
using int8pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&);
|
| 11 |
+
|
| 12 |
+
DECLARE_DISPATCH(weight_to_int4pack_fn, weight_to_int4pack_stub);
|
| 13 |
+
DECLARE_DISPATCH(int4pack_mm_fn, int4pack_mm_stub);
|
| 14 |
+
DECLARE_DISPATCH(int8pack_mm_fn, int8pack_mm_stub);
|
| 15 |
+
|
| 16 |
+
} // namespace at::native
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/MPSGraphVenturaOps.h
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
|
| 3 |
+
|
| 4 |
+
// TODO: Remove me when moved to MacOS 13
|
| 5 |
+
#if !defined(__MAC_13_2) && \
|
| 6 |
+
(!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))
|
| 7 |
+
|
| 8 |
+
@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>
|
| 9 |
+
|
| 10 |
+
@property (readwrite, nonatomic) NSUInteger strideInX;
|
| 11 |
+
@property (readwrite, nonatomic) NSUInteger strideInY;
|
| 12 |
+
@property (readwrite, nonatomic) NSUInteger strideInZ;
|
| 13 |
+
@property (readwrite, nonatomic) NSUInteger dilationRateInX;
|
| 14 |
+
@property (readwrite, nonatomic) NSUInteger dilationRateInY;
|
| 15 |
+
@property (readwrite, nonatomic) NSUInteger dilationRateInZ;
|
| 16 |
+
|
| 17 |
+
@property (readwrite, nonatomic) NSUInteger paddingLeft;
|
| 18 |
+
@property (readwrite, nonatomic) NSUInteger paddingRight;
|
| 19 |
+
@property (readwrite, nonatomic) NSUInteger paddingTop;
|
| 20 |
+
@property (readwrite, nonatomic) NSUInteger paddingBottom;
|
| 21 |
+
@property (readwrite, nonatomic) NSUInteger paddingFront;
|
| 22 |
+
@property (readwrite, nonatomic) NSUInteger paddingBack;
|
| 23 |
+
|
| 24 |
+
@property (readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
|
| 25 |
+
@property (readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
|
| 26 |
+
@property (readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;
|
| 27 |
+
|
| 28 |
+
@property (readwrite, nonatomic) NSUInteger groups;
|
| 29 |
+
|
| 30 |
+
@end
|
| 31 |
+
|
| 32 |
+
@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;
|
| 33 |
+
|
| 34 |
+
#endif
|
| 35 |
+
|
| 36 |
+
@interface MPSGraph (VenturaOps)
|
| 37 |
+
|
| 38 |
+
#if !defined(__MAC_13_0) && \
|
| 39 |
+
(!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
|
| 40 |
+
|
| 41 |
+
typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
|
| 42 |
+
{
|
| 43 |
+
MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
|
| 44 |
+
MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
|
| 45 |
+
MPSGraphResizeNearestRoundingModeCeil = 2L,
|
| 46 |
+
MPSGraphResizeNearestRoundingModeFloor = 3L,
|
| 47 |
+
MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
|
| 48 |
+
MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
|
| 49 |
+
};
|
| 50 |
+
|
| 51 |
+
// Define complex enums for MacOS 12
|
| 52 |
+
#define MPSDataTypeComplexBit 0x01000000
|
| 53 |
+
#define MPSDataTypeComplexFloat32 ((MPSDataType) (MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
|
| 54 |
+
#define MPSDataTypeComplexFloat16 ((MPSDataType) (MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
|
| 55 |
+
#endif
|
| 56 |
+
|
| 57 |
+
- (MPSGraphTensor * _Nonnull) convolution3DWithSourceTensor:(MPSGraphTensor * _Nonnull) source
|
| 58 |
+
weightsTensor:(MPSGraphTensor * _Nonnull) weights
|
| 59 |
+
descriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) descriptor
|
| 60 |
+
name:(NSString * _Nullable) name;
|
| 61 |
+
|
| 62 |
+
- (MPSGraphTensor * _Nonnull) convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor * _Nonnull) incomingGradient
|
| 63 |
+
weightsTensor:(MPSGraphTensor * _Nonnull) weights
|
| 64 |
+
outputShape:(MPSShape * _Nonnull) outputShape
|
| 65 |
+
forwardConvolutionDescriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) forwardConvolutionDescriptor
|
| 66 |
+
name:(NSString * _Nullable) name;
|
| 67 |
+
|
| 68 |
+
- (MPSGraphTensor * _Nonnull) convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor * _Nonnull) incomingGradient
|
| 69 |
+
sourceTensor:(MPSGraphTensor * _Nonnull) source
|
| 70 |
+
outputShape:(MPSShape * _Nonnull) outputShape
|
| 71 |
+
forwardConvolutionDescriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) forwardConvolutionDescriptor
|
| 72 |
+
name:(NSString * _Nullable) name;
|
| 73 |
+
|
| 74 |
+
- (MPSGraphTensor * _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor * _Nonnull)tensor
|
| 75 |
+
axis:(NSInteger)axis
|
| 76 |
+
name:(NSString * _Nullable)name;
|
| 77 |
+
|
| 78 |
+
- (MPSGraphTensor * _Nonnull)sortWithTensor:(MPSGraphTensor * _Nonnull)tensor
|
| 79 |
+
axis:(NSInteger)axis
|
| 80 |
+
name:(NSString * _Nullable)name;
|
| 81 |
+
|
| 82 |
+
- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
|
| 83 |
+
axis:(NSInteger) axis
|
| 84 |
+
descending:(BOOL) descending
|
| 85 |
+
name:(NSString * _Nullable) name;
|
| 86 |
+
|
| 87 |
+
- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
|
| 88 |
+
axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
|
| 89 |
+
descending:(BOOL) descending
|
| 90 |
+
name:(NSString * _Nullable) name;
|
| 91 |
+
|
| 92 |
+
- (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
|
| 93 |
+
axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
|
| 94 |
+
name:(NSString * _Nullable) name;
|
| 95 |
+
|
| 96 |
+
- (MPSGraphTensor * _Nonnull)argSortWithTensor:(MPSGraphTensor * _Nonnull)tensor
|
| 97 |
+
axis:(NSInteger)axis
|
| 98 |
+
name:(NSString * _Nullable)name;
|
| 99 |
+
|
| 100 |
+
- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
|
| 101 |
+
axis:(NSInteger) axis
|
| 102 |
+
descending:(BOOL) descending
|
| 103 |
+
name:(NSString * _Nullable) name;
|
| 104 |
+
|
| 105 |
+
- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
|
| 106 |
+
axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
|
| 107 |
+
descending:(BOOL) descending
|
| 108 |
+
name:(NSString * _Nullable) name;
|
| 109 |
+
|
| 110 |
+
- (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
|
| 111 |
+
axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
|
| 112 |
+
name:(NSString * _Nullable) name;
|
| 113 |
+
|
| 114 |
+
- (MPSGraphTensor * _Nonnull)inverseOfTensor:(MPSGraphTensor * _Nonnull) inputTensor
|
| 115 |
+
name:(NSString * _Nullable)name;
|
| 116 |
+
|
| 117 |
+
- (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
|
| 118 |
+
sizeTensor:(MPSGraphTensor * _Nonnull) size
|
| 119 |
+
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
|
| 120 |
+
centerResult:(BOOL) centerResult
|
| 121 |
+
alignCorners:(BOOL) alignCorners
|
| 122 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 123 |
+
name:(NSString * _Nullable) name;
|
| 124 |
+
|
| 125 |
+
- (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
|
| 126 |
+
sizeTensor:(MPSGraphTensor * _Nonnull) size
|
| 127 |
+
scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
|
| 128 |
+
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
|
| 129 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 130 |
+
name:(NSString * _Nullable) name;
|
| 131 |
+
|
| 132 |
+
- (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
|
| 133 |
+
sizeTensor:(MPSGraphTensor * _Nonnull) size
|
| 134 |
+
centerResult:(BOOL) centerResult
|
| 135 |
+
alignCorners:(BOOL) alignCorners
|
| 136 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 137 |
+
name:(NSString * _Nullable) name;
|
| 138 |
+
|
| 139 |
+
- (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
|
| 140 |
+
sizeTensor:(MPSGraphTensor * _Nonnull) size
|
| 141 |
+
scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
|
| 142 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 143 |
+
name:(NSString * _Nullable) name;
|
| 144 |
+
|
| 145 |
+
- (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
|
| 146 |
+
input:(MPSGraphTensor * _Nonnull) input
|
| 147 |
+
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
|
| 148 |
+
centerResult:(BOOL) centerResult
|
| 149 |
+
alignCorners:(BOOL) alignCorners
|
| 150 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 151 |
+
name:(NSString * _Nullable) name;
|
| 152 |
+
|
| 153 |
+
- (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
|
| 154 |
+
input:(MPSGraphTensor * _Nonnull) input
|
| 155 |
+
scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
|
| 156 |
+
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
|
| 157 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 158 |
+
name:(NSString * _Nullable) name;
|
| 159 |
+
|
| 160 |
+
- (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
|
| 161 |
+
input:(MPSGraphTensor * _Nonnull) input
|
| 162 |
+
centerResult:(BOOL) centerResult
|
| 163 |
+
alignCorners:(BOOL) alignCorners
|
| 164 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 165 |
+
name:(NSString * _Nullable) name;
|
| 166 |
+
|
| 167 |
+
- (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
|
| 168 |
+
input:(MPSGraphTensor * _Nonnull) input
|
| 169 |
+
scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
|
| 170 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 171 |
+
name:(NSString * _Nullable) name;
|
| 172 |
+
|
| 173 |
+
- (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source
|
| 174 |
+
coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates
|
| 175 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 176 |
+
normalizeCoordinates:(BOOL) normalizeCoordinates
|
| 177 |
+
relativeCoordinates:(BOOL) relativeCoordinates
|
| 178 |
+
alignCorners:(BOOL) alignCorners
|
| 179 |
+
paddingMode:(MPSGraphPaddingMode) paddingMode
|
| 180 |
+
samplingMode:(MPSGraphResizeMode) samplingMode
|
| 181 |
+
constantValue:(double) constantValue
|
| 182 |
+
name:(NSString * _Nullable) name;
|
| 183 |
+
|
| 184 |
+
- (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source
|
| 185 |
+
coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates
|
| 186 |
+
layout:(MPSGraphTensorNamedDataLayout) layout
|
| 187 |
+
normalizeCoordinates:(BOOL) normalizeCoordinates
|
| 188 |
+
relativeCoordinates:(BOOL) relativeCoordinates
|
| 189 |
+
alignCorners:(BOOL) alignCorners
|
| 190 |
+
paddingMode:(MPSGraphPaddingMode) paddingMode
|
| 191 |
+
nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
|
| 192 |
+
constantValue:(double) constantValue
|
| 193 |
+
name:(NSString * _Nullable) name;
|
| 194 |
+
- (MPSGraphTensor * _Nonnull) truncateWithTensor:(MPSGraphTensor * _Nonnull) tensor
|
| 195 |
+
name:(NSString * _Nullable) name;
|
| 196 |
+
|
| 197 |
+
@end
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorFactories.h
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
namespace at {
|
| 4 |
+
namespace native {
|
| 5 |
+
|
| 6 |
+
} // namespace native
|
| 7 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <ATen/core/ATen_fwd.h>
|
| 4 |
+
#include <ATen/NestedTensorImpl.h>
|
| 5 |
+
#include <c10/macros/Macros.h>
|
| 6 |
+
|
| 7 |
+
namespace at {
|
| 8 |
+
namespace native {
|
| 9 |
+
|
| 10 |
+
TORCH_API Tensor NestedTensor_to_padded_tensor_generic(
|
| 11 |
+
const Tensor& t,
|
| 12 |
+
double padding,
|
| 13 |
+
OptionalIntArrayRef output_size);
|
| 14 |
+
|
| 15 |
+
template <typename Func>
|
| 16 |
+
Tensor map_nt(const Tensor& nt, Func f) {
|
| 17 |
+
auto* nt_impl = get_nested_tensor_impl(nt);
|
| 18 |
+
const auto& sizes = nt_impl->get_nested_sizes();
|
| 19 |
+
return at::detail::make_tensor<NestedTensorImpl>(f(nt_impl->get_buffer()), sizes);
|
| 20 |
+
}
|
| 21 |
+
template <typename Func>
|
| 22 |
+
Tensor map_nt_binary(const Tensor& nt_1, const Tensor& nt_2, Func f){
|
| 23 |
+
auto* nt_impl_1 = get_nested_tensor_impl(nt_1);
|
| 24 |
+
auto* nt_impl_2 = get_nested_tensor_impl(nt_2);
|
| 25 |
+
const auto& sizes = nt_impl_1->get_nested_sizes();
|
| 26 |
+
return at::detail::make_tensor<NestedTensorImpl>(f(nt_impl_1->get_buffer(), nt_impl_2->get_buffer()), sizes);
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
|
| 30 |
+
const NestedTensorImpl& input,
|
| 31 |
+
IntArrayRef normalized_shape,
|
| 32 |
+
const Tensor& weight /* optional */,
|
| 33 |
+
const Tensor& bias /* optional */) {
|
| 34 |
+
|
| 35 |
+
const size_t normalized_ndim = normalized_shape.size();
|
| 36 |
+
TORCH_CHECK(
|
| 37 |
+
normalized_ndim >= 1,
|
| 38 |
+
"Expected normalized_shape to be at least 1-dimensional, i.e., ",
|
| 39 |
+
"containing at least one element, but got normalized_shape = ",
|
| 40 |
+
normalized_shape);
|
| 41 |
+
TORCH_CHECK(
|
| 42 |
+
!weight.defined() || weight.sizes().equals(normalized_shape),
|
| 43 |
+
"Expected weight to be of same shape as normalized_shape, but got ",
|
| 44 |
+
"weight of shape ",
|
| 45 |
+
weight.sizes(),
|
| 46 |
+
" and normalized_shape = ",
|
| 47 |
+
normalized_shape);
|
| 48 |
+
TORCH_CHECK(
|
| 49 |
+
!bias.defined() || bias.sizes().equals(normalized_shape),
|
| 50 |
+
"Expected bias to be of same shape as normalized_shape, but got ",
|
| 51 |
+
"bias of shape ",
|
| 52 |
+
bias.sizes(),
|
| 53 |
+
" and normalized_shape = ",
|
| 54 |
+
normalized_shape);
|
| 55 |
+
|
| 56 |
+
// Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
|
| 57 |
+
// Also, compute M and N considering the idiosyncracies of NestedTensors
|
| 58 |
+
int64_t N = 1;
|
| 59 |
+
for (const auto i: c10::irange(normalized_ndim)) {
|
| 60 |
+
TORCH_CHECK(
|
| 61 |
+
input.opt_size(-normalized_ndim + i) != c10::nullopt,
|
| 62 |
+
"normalized_shape extends into irregular dimensions for the nested tensor"
|
| 63 |
+
);
|
| 64 |
+
TORCH_CHECK(
|
| 65 |
+
normalized_shape[i] == *input.opt_size(-normalized_ndim + i),
|
| 66 |
+
"The shape at dimension ",
|
| 67 |
+
i,
|
| 68 |
+
"of normalized_shape doesn't match the input"
|
| 69 |
+
);
|
| 70 |
+
N *= normalized_shape[i];
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
const int64_t M = input.numel() / N;
|
| 74 |
+
|
| 75 |
+
return std::make_pair(M, N);
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape);
|
| 79 |
+
|
| 80 |
+
} // namespace native
|
| 81 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include <ATen/ATen.h>
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
namespace at {
|
| 5 |
+
namespace native {
|
| 6 |
+
namespace preprocessing {
|
| 7 |
+
|
| 8 |
+
/**
|
| 9 |
+
* This function will take nested query, key, and value
|
| 10 |
+
* and will preprocess it in order to run with either
|
| 11 |
+
* the flash-attention or efficient-attention kernels.
|
| 12 |
+
* @return A tuple containing all the necessary data for running the fused
|
| 13 |
+
* kernels
|
| 14 |
+
*/
|
| 15 |
+
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, int64_t, int64_t, Tensor>
|
| 16 |
+
sdpa_nested_preprocessing(
|
| 17 |
+
const Tensor& query,
|
| 18 |
+
const Tensor& key,
|
| 19 |
+
const Tensor& value);
|
| 20 |
+
|
| 21 |
+
/**
|
| 22 |
+
* This function will take nested query, key, and value, grad_out, and out
|
| 23 |
+
* and will preprocess it in order to run with either
|
| 24 |
+
* the flash-attention or efficient-attention kernels backwards.
|
| 25 |
+
* We use both functions to avoid having to do the same preprocessing
|
| 26 |
+
* for cumulative_sequence_length_q and cumulative_sequence_length_kv
|
| 27 |
+
* @return A tuple containing all the necessary data for running the fused
|
| 28 |
+
* kernels
|
| 29 |
+
*/
|
| 30 |
+
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>
|
| 31 |
+
sdpa_nested_preprocessing_backward(
|
| 32 |
+
const at::Tensor& grad_out_,
|
| 33 |
+
const at::Tensor& query,
|
| 34 |
+
const at::Tensor& key,
|
| 35 |
+
const at::Tensor& value,
|
| 36 |
+
const at::Tensor& out,
|
| 37 |
+
const Tensor& cumulative_sequence_length_q,
|
| 38 |
+
const Tensor& cumulative_sequence_length_kv,
|
| 39 |
+
const int64_t max_seqlen_batch_q,
|
| 40 |
+
const int64_t max_seqlen_batch_kv);
|
| 41 |
+
|
| 42 |
+
} // namespace preprocessing
|
| 43 |
+
} // namespace native
|
| 44 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
#include <ATen/Dispatch.h>
|
| 4 |
+
#include <ATen/NestedTensorImpl.h>
|
| 5 |
+
#include <ATen/Parallel.h>
|
| 6 |
+
#include <ATen/core/Tensor.h>
|
| 7 |
+
#include <c10/core/DispatchKeySet.h>
|
| 8 |
+
#include <c10/core/TensorImpl.h>
|
| 9 |
+
#include <c10/macros/Macros.h>
|
| 10 |
+
#include <c10/util/Exception.h>
|
| 11 |
+
|
| 12 |
+
#ifndef AT_PER_OPERATOR_HEADERS
|
| 13 |
+
|
| 14 |
+
#include <ATen/Functions.h>
|
| 15 |
+
#include <ATen/NativeFunctions.h>
|
| 16 |
+
#else
|
| 17 |
+
#include <ATen/ops/cat.h>
|
| 18 |
+
#include <ATen/ops/empty.h>
|
| 19 |
+
#include <ATen/ops/ones_native.h>
|
| 20 |
+
#include <ATen/ops/prod.h>
|
| 21 |
+
#include <ATen/ops/stack_native.h>
|
| 22 |
+
#include <ATen/ops/tensor.h>
|
| 23 |
+
#endif
|
| 24 |
+
|
| 25 |
+
#include <utility>
|
| 26 |
+
#include <vector>
|
| 27 |
+
|
| 28 |
+
namespace at {
|
| 29 |
+
namespace native {
|
| 30 |
+
struct NestedTensorImpl;
|
| 31 |
+
|
| 32 |
+
// The following functions are used to construct nested tensors from buffers and
|
| 33 |
+
// metadata.
|
| 34 |
+
|
| 35 |
+
inline at::Tensor wrap_buffer(at::Tensor buffer, at::Tensor nested_sizes) {
|
| 36 |
+
TORCH_CHECK(
|
| 37 |
+
buffer.dim() == 1,
|
| 38 |
+
"Expected given buffer to be 1dim, but got ",
|
| 39 |
+
buffer.dim(),
|
| 40 |
+
" instead.");
|
| 41 |
+
TORCH_CHECK(
|
| 42 |
+
buffer.is_contiguous(), "Expected given buffer to be contiguous.");
|
| 43 |
+
return at::detail::make_tensor<NestedTensorImpl>(
|
| 44 |
+
std::move(buffer), std::move(nested_sizes));
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
// TODO: Figure out if we need a non-moving wrap_buffer()
|
| 48 |
+
inline at::Tensor wrap_buffer(
|
| 49 |
+
at::Tensor buffer,
|
| 50 |
+
at::Tensor nested_sizes,
|
| 51 |
+
at::Tensor nested_strides,
|
| 52 |
+
at::Tensor storage_offsets) {
|
| 53 |
+
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
| 54 |
+
buffer.is_contiguous(), "Given buffer must be contiguous.");
|
| 55 |
+
return at::detail::make_tensor<NestedTensorImpl>(
|
| 56 |
+
std::move(buffer),
|
| 57 |
+
std::move(nested_sizes),
|
| 58 |
+
std::move(nested_strides),
|
| 59 |
+
std::move(storage_offsets));
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
inline at::Tensor get_buffer(const at::Tensor& tensor) {
|
| 63 |
+
return get_nested_tensor_impl(tensor)->get_buffer();
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
/**
|
| 67 |
+
* Create a new nested tensor that is a view of a base nested tensor
|
| 68 |
+
*
|
| 69 |
+
* create_view_tensor calls a specialized constructor that copys the
|
| 70 |
+
* the keys from base onto the new view tensor being created.
|
| 71 |
+
* The storage is shared between the base and the returned view tensor
|
| 72 |
+
*
|
| 73 |
+
* All callers of this helper must:
|
| 74 |
+
* - Only return a view of the input
|
| 75 |
+
* - Must be explicit and define a derivative
|
| 76 |
+
*
|
| 77 |
+
* @param base Base tensor to construct view from.
|
| 78 |
+
* @param nested_sizes View tensors' sizes.
|
| 79 |
+
* @param nested_strides View tensors' strides.
|
| 80 |
+
* @param storage_offsets View tensors' offsets.
|
| 81 |
+
* @return A newly constructed view tensor
|
| 82 |
+
*/
|
| 83 |
+
inline at::Tensor create_nested_view_tensor(
|
| 84 |
+
const at::Tensor& base,
|
| 85 |
+
at::Tensor nested_sizes,
|
| 86 |
+
at::Tensor nested_strides,
|
| 87 |
+
at::Tensor storage_offsets) {
|
| 88 |
+
TORCH_INTERNAL_ASSERT(
|
| 89 |
+
base.is_nested(),
|
| 90 |
+
"This function can only be used to create nested tensor views");
|
| 91 |
+
TORCH_INTERNAL_ASSERT(
|
| 92 |
+
c10::impl::tls_local_dispatch_key_set().excluded_.has(
|
| 93 |
+
c10::DispatchKey::AutogradFunctionality),
|
| 94 |
+
"Creating a non differentiable nested tensor view in a CompositeImplicit function is not allowed.");
|
| 95 |
+
return at::detail::make_tensor<NestedTensorImpl>(
|
| 96 |
+
c10::TensorImpl::VIEW,
|
| 97 |
+
base,
|
| 98 |
+
nested_sizes,
|
| 99 |
+
nested_strides,
|
| 100 |
+
storage_offsets);
|
| 101 |
+
}
|
| 102 |
+
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
| 103 |
+
|
| 104 |
+
// Helper functions for getting information about a nested tensor's shape.
|
| 105 |
+
|
| 106 |
+
int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt);
|
| 107 |
+
|
| 108 |
+
// The sizes of the underlying tensors
|
| 109 |
+
inline std::vector<IntArrayRef> NestedTensor_get_sizes(
|
| 110 |
+
const NestedTensorImpl* self_ptr) {
|
| 111 |
+
int64_t ntensors = self_ptr->size(0);
|
| 112 |
+
std::vector<IntArrayRef> sizes(ntensors);
|
| 113 |
+
if (ntensors == 0) {
|
| 114 |
+
return sizes;
|
| 115 |
+
}
|
| 116 |
+
const Tensor& sizemat = self_ptr->get_nested_sizes();
|
| 117 |
+
int64_t orig_dim = sizemat.size(1);
|
| 118 |
+
// nesting scalars has empty sizes
|
| 119 |
+
if (orig_dim == 0) {
|
| 120 |
+
return sizes;
|
| 121 |
+
}
|
| 122 |
+
const int64_t* sizemat_ptr = sizemat.data_ptr<int64_t>();
|
| 123 |
+
|
| 124 |
+
for (const auto i : c10::irange(ntensors)) {
|
| 125 |
+
sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim);
|
| 126 |
+
sizemat_ptr += orig_dim;
|
| 127 |
+
}
|
| 128 |
+
return sizes;
|
| 129 |
+
}
|
| 130 |
+
|
| 131 |
+
TORCH_API std::vector<int64_t> NestedTensor_get_max_size(
|
| 132 |
+
const NestedTensorImpl& nt);
|
| 133 |
+
|
| 134 |
+
std::vector<int64_t> NestedTensor_get_max_size_from_size_tensor(
|
| 135 |
+
const Tensor& sizes);
|
| 136 |
+
|
| 137 |
+
inline std::vector<IntArrayRef> NestedTensor_get_sizes(const at::Tensor& self) {
|
| 138 |
+
const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self);
|
| 139 |
+
return NestedTensor_get_sizes(self_ptr);
|
| 140 |
+
}
|
| 141 |
+
// The strides of the underlying tensors
|
| 142 |
+
inline std::vector<IntArrayRef> NestedTensor_get_strides(
|
| 143 |
+
const NestedTensorImpl* self_ptr) {
|
| 144 |
+
int64_t ntensors = self_ptr->size(0);
|
| 145 |
+
std::vector<IntArrayRef> strides(ntensors);
|
| 146 |
+
if (ntensors == 0) {
|
| 147 |
+
return strides;
|
| 148 |
+
}
|
| 149 |
+
const Tensor& stridemat = self_ptr->get_nested_strides();
|
| 150 |
+
int64_t orig_dim = stridemat.size(1);
|
| 151 |
+
// nesting scalars has empty strides
|
| 152 |
+
if (orig_dim == 0) {
|
| 153 |
+
return strides;
|
| 154 |
+
}
|
| 155 |
+
const int64_t* stridemat_ptr = stridemat.data_ptr<int64_t>();
|
| 156 |
+
for (const auto i : c10::irange(ntensors)) {
|
| 157 |
+
strides[i] = IntArrayRef(stridemat_ptr, stridemat_ptr + orig_dim);
|
| 158 |
+
stridemat_ptr += orig_dim;
|
| 159 |
+
}
|
| 160 |
+
return strides;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
inline std::vector<IntArrayRef> NestedTensor_get_strides(
|
| 164 |
+
const at::Tensor& self) {
|
| 165 |
+
const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self);
|
| 166 |
+
return NestedTensor_get_strides(self_ptr);
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
inline void check_numel_equals_buffer_size(const at::Tensor& self) {
|
| 170 |
+
auto self_impl = get_nested_tensor_impl(self);
|
| 171 |
+
TORCH_CHECK(
|
| 172 |
+
self.numel() == static_cast<int64_t>(self_impl->get_buffer_size()),
|
| 173 |
+
"Number of elements in nested tensor must match number of elements in buffer.");
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) {
|
| 177 |
+
TORCH_CHECK(
|
| 178 |
+
self_ptr->numel() == static_cast<int64_t>(self_ptr->get_buffer_size()),
|
| 179 |
+
"Number of elements in nested tensor must match number of elements in buffer.");
|
| 180 |
+
}
|
| 181 |
+
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
| 182 |
+
// Data structures and functions for generically applying a function on a nested
|
| 183 |
+
// tensor.
|
| 184 |
+
namespace impl {
|
| 185 |
+
|
| 186 |
+
template <typename T>
|
| 187 |
+
struct NestedNode {
|
| 188 |
+
NestedNode() = delete;
|
| 189 |
+
explicit NestedNode(std::vector<T>&& children)
|
| 190 |
+
: _is_leaf(false), _children(children) {}
|
| 191 |
+
explicit NestedNode(TensorList children)
|
| 192 |
+
: _is_leaf(false), _children(children.vec()) {}
|
| 193 |
+
// NestedNode(NestedNode&) = delete;
|
| 194 |
+
// NestedNode(const NestedNode&) = delete;
|
| 195 |
+
// NestedNode& operator=(NestedNode) = delete;
|
| 196 |
+
explicit NestedNode(T payload) : _is_leaf(true), _payload(std::move(payload)) {}
|
| 197 |
+
inline bool is_leaf() const {
|
| 198 |
+
return _is_leaf;
|
| 199 |
+
}
|
| 200 |
+
inline size_t degree() const {
|
| 201 |
+
return _children.size();
|
| 202 |
+
}
|
| 203 |
+
inline const std::vector<T> unbind() const {
|
| 204 |
+
return _children;
|
| 205 |
+
}
|
| 206 |
+
inline T children(size_t i) const {
|
| 207 |
+
return _children[i];
|
| 208 |
+
}
|
| 209 |
+
inline const T& payload() const {
|
| 210 |
+
return _payload;
|
| 211 |
+
}
|
| 212 |
+
inline T& payload() {
|
| 213 |
+
return _payload;
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
private:
|
| 217 |
+
bool _is_leaf;
|
| 218 |
+
std::vector<T> _children;
|
| 219 |
+
T _payload;
|
| 220 |
+
};
|
| 221 |
+
|
| 222 |
+
using TensorNode = NestedNode<at::Tensor>;
|
| 223 |
+
|
| 224 |
+
template <class F, class A, class TypeList>
|
| 225 |
+
class _map;
|
| 226 |
+
|
| 227 |
+
template <class F, class A, class... Args>
|
| 228 |
+
class _map<F, A, c10::guts::typelist::typelist<Args...>> {
|
| 229 |
+
public:
|
| 230 |
+
static A function_one(F&& fn, const Args&... nested_node) {
|
| 231 |
+
return std::forward<F>(fn)(nested_node...);
|
| 232 |
+
}
|
| 233 |
+
// NOTE: We must move F to avoid copying objects if it is a lambda with
|
| 234 |
+
// captures.
|
| 235 |
+
static NestedNode<A> function(
|
| 236 |
+
F&& fn,
|
| 237 |
+
const NestedNode<Args>&... nested_node) {
|
| 238 |
+
size_t degree = 0;
|
| 239 |
+
bool all_leaf = true;
|
| 240 |
+
c10::guts::tuple_map(
|
| 241 |
+
std::forward_as_tuple(nested_node...), [&all_leaf, °ree](auto n) {
|
| 242 |
+
all_leaf = all_leaf && (n.is_leaf());
|
| 243 |
+
if (degree > 1 && n.degree() > 1) {
|
| 244 |
+
TORCH_CHECK(
|
| 245 |
+
degree == n.degree(), "NestedNodes must match in degree.");
|
| 246 |
+
}
|
| 247 |
+
if (n.degree() > degree) {
|
| 248 |
+
degree = n.degree();
|
| 249 |
+
}
|
| 250 |
+
return nullptr;
|
| 251 |
+
});
|
| 252 |
+
// All NestedNodes just wrap regular objects.
|
| 253 |
+
if (all_leaf) {
|
| 254 |
+
return NestedNode<A>(std::forward<F>(fn)(nested_node.payload()...));
|
| 255 |
+
}
|
| 256 |
+
// Some NestedNodes wrap regular Tensors, some NestedTensors and some other
|
| 257 |
+
// types.
|
| 258 |
+
std::vector<A> result;
|
| 259 |
+
for (size_t i = 0; i < degree; i++) {
|
| 260 |
+
std::tuple<Args...> children = c10::guts::tuple_map(
|
| 261 |
+
std::forward_as_tuple(nested_node...), [&i](auto a) {
|
| 262 |
+
static_assert(
|
| 263 |
+
c10::guts::is_instantiation_of<NestedNode, decltype(a)>::value,
|
| 264 |
+
"Internal error.");
|
| 265 |
+
// Broadcast regular arguments across NestedTensor constituents.
|
| 266 |
+
// This could be a Tensor, integer or anything else really.
|
| 267 |
+
if (a.is_leaf()) {
|
| 268 |
+
return a.payload();
|
| 269 |
+
}
|
| 270 |
+
// Broadcast NestedTensors with one constituent.
|
| 271 |
+
if (a.degree() == 1 && !a.is_leaf()) {
|
| 272 |
+
return a.children(0);
|
| 273 |
+
}
|
| 274 |
+
TORCH_CHECK(a.degree() > 0, "Internal assert.");
|
| 275 |
+
return a.children(i);
|
| 276 |
+
});
|
| 277 |
+
c10::guts::apply(
|
| 278 |
+
[&result, &fn](Args... filtered) {
|
| 279 |
+
result.emplace_back(function_one(std::forward<F>(fn), filtered...));
|
| 280 |
+
},
|
| 281 |
+
std::move(children));
|
| 282 |
+
}
|
| 283 |
+
return NestedNode<A>(std::move(result));
|
| 284 |
+
}
|
| 285 |
+
};
|
| 286 |
+
|
| 287 |
+
// TODO: Add static assert to verify lambda arguments match nested_node types
|
| 288 |
+
template <class F, class... B>
|
| 289 |
+
static inline NestedNode<
|
| 290 |
+
typename c10::guts::infer_function_traits<F>::type::return_type>
|
| 291 |
+
map(F&& fn, const NestedNode<B>&... nested_node) {
|
| 292 |
+
return _map<
|
| 293 |
+
F,
|
| 294 |
+
typename c10::guts::infer_function_traits<F>::type::return_type,
|
| 295 |
+
typename c10::guts::infer_function_traits<F>::type::parameter_types>::
|
| 296 |
+
function(std::forward<F>(fn), nested_node...);
|
| 297 |
+
}
|
| 298 |
+
|
| 299 |
+
inline TensorNode get_nested_tensor_structure(at::Tensor tensor) {
|
| 300 |
+
if (get_nested_tensor_impl_or_null(tensor) == nullptr) {
|
| 301 |
+
return TensorNode(std::move(tensor));
|
| 302 |
+
}
|
| 303 |
+
return TensorNode(tensor.unbind());
|
| 304 |
+
}
|
| 305 |
+
|
| 306 |
+
inline Tensor wrap_tensor_node(
|
| 307 |
+
TensorNode tensor_node,
|
| 308 |
+
c10::optional<ScalarType> dtype,
|
| 309 |
+
c10::optional<Layout> layout,
|
| 310 |
+
c10::optional<Device> device,
|
| 311 |
+
c10::optional<bool> pin_memory) {
|
| 312 |
+
TORCH_CHECK(
|
| 313 |
+
!tensor_node.is_leaf(), "Expected TensorNode to wrap a list of Tensors.");
|
| 314 |
+
TensorOptions options_ =
|
| 315 |
+
TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(
|
| 316 |
+
pin_memory);
|
| 317 |
+
if (tensor_node.degree() == 0) {
|
| 318 |
+
return wrap_buffer(ones({0}, dtype, layout, device), ones({}));
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
// Fast path: if all tensors are on CPU, have contiguous memory, and the same
|
| 322 |
+
// dtype, copying can be done much faster.
|
| 323 |
+
bool all_tensors_cpu = true;
|
| 324 |
+
bool all_tensors_contiguous = true;
|
| 325 |
+
bool all_tensors_same_dtype = true;
|
| 326 |
+
auto first_dtype = tensor_node.children(0).dtype();
|
| 327 |
+
std::vector<long> start_offsets(tensor_node.degree());
|
| 328 |
+
start_offsets[0] = 0;
|
| 329 |
+
long total_size = 0;
|
| 330 |
+
for (const auto i : c10::irange(tensor_node.degree())) {
|
| 331 |
+
all_tensors_cpu = all_tensors_cpu && tensor_node.children(i).is_cpu();
|
| 332 |
+
all_tensors_contiguous =
|
| 333 |
+
all_tensors_contiguous && tensor_node.children(i).is_contiguous();
|
| 334 |
+
all_tensors_same_dtype = all_tensors_same_dtype &&
|
| 335 |
+
(first_dtype == tensor_node.children(i).dtype());
|
| 336 |
+
if (!(all_tensors_cpu && all_tensors_contiguous &&
|
| 337 |
+
all_tensors_same_dtype)) {
|
| 338 |
+
break;
|
| 339 |
+
}
|
| 340 |
+
if (i > 0) {
|
| 341 |
+
start_offsets[i] =
|
| 342 |
+
start_offsets[i - 1] + tensor_node.children(i - 1).numel();
|
| 343 |
+
}
|
| 344 |
+
total_size += tensor_node.children(i).numel();
|
| 345 |
+
}
|
| 346 |
+
|
| 347 |
+
TensorOptions options;
|
| 348 |
+
Tensor nt_buffer, nt_sizes;
|
| 349 |
+
if (all_tensors_cpu && all_tensors_contiguous && all_tensors_same_dtype) {
|
| 350 |
+
nt_buffer = at::empty({total_size}, tensor_node.children(0).options());
|
| 351 |
+
nt_sizes = at::empty(
|
| 352 |
+
{static_cast<long>(tensor_node.degree()),
|
| 353 |
+
static_cast<long>(tensor_node.children(0).sizes().size())},
|
| 354 |
+
TensorOptions().dtype(kLong));
|
| 355 |
+
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
|
| 356 |
+
at::ScalarType::Half,
|
| 357 |
+
at::ScalarType::Bool,
|
| 358 |
+
at::ScalarType::BFloat16,
|
| 359 |
+
c10::typeMetaToScalarType(first_dtype),
|
| 360 |
+
"create_nt_buffer",
|
| 361 |
+
[&]() {
|
| 362 |
+
at::parallel_for(
|
| 363 |
+
0, tensor_node.degree(), 1, [&](int64_t begin, int64_t end) {
|
| 364 |
+
for (int64_t i = begin; i < end; ++i) {
|
| 365 |
+
// Only try copying memory if there is more than 0 elements
|
| 366 |
+
// for a certain tensor
|
| 367 |
+
if (tensor_node.children(i).numel() > 0) {
|
| 368 |
+
memcpy(
|
| 369 |
+
nt_buffer.mutable_data_ptr<scalar_t>() + start_offsets[i],
|
| 370 |
+
tensor_node.children(i).const_data_ptr<scalar_t>(),
|
| 371 |
+
tensor_node.children(i).numel() * sizeof(scalar_t));
|
| 372 |
+
}
|
| 373 |
+
}
|
| 374 |
+
});
|
| 375 |
+
});
|
| 376 |
+
long sizes_offset = 0;
|
| 377 |
+
for (size_t i = 0; i < tensor_node.degree(); ++i) {
|
| 378 |
+
auto tensor_sizes = tensor_node.children(i).sizes();
|
| 379 |
+
for (int64_t tensor_size : tensor_sizes) {
|
| 380 |
+
nt_sizes.mutable_data_ptr<int64_t>()[sizes_offset++] = tensor_size;
|
| 381 |
+
}
|
| 382 |
+
}
|
| 383 |
+
options = nt_buffer.options().merge_in(options_);
|
| 384 |
+
} else { // Slow path
|
| 385 |
+
std::vector<Tensor> flat_tensors;
|
| 386 |
+
std::vector<Tensor> sizes;
|
| 387 |
+
for (const auto i : c10::irange(tensor_node.degree())) {
|
| 388 |
+
flat_tensors.push_back(tensor_node.children(i).reshape(-1).contiguous());
|
| 389 |
+
sizes.push_back(
|
| 390 |
+
tensor(c10::IntArrayRef(tensor_node.children(i).sizes())));
|
| 391 |
+
}
|
| 392 |
+
options = flat_tensors[0].options().merge_in(options_);
|
| 393 |
+
nt_buffer = at::cat(flat_tensors);
|
| 394 |
+
nt_sizes = at::native::stack(sizes);
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
return wrap_buffer(nt_buffer.to(options), nt_sizes);
|
| 398 |
+
}
|
| 399 |
+
|
| 400 |
+
} // namespace impl
|
| 401 |
+
|
| 402 |
+
// This function is meant to ease rapid operator coverage for
|
| 403 |
+
// NestedTensor kernels. It is not meant to be efficient. Use it judiciously.
|
| 404 |
+
template <class F, class... A>
|
| 405 |
+
inline at::Tensor map_nested_tensor(F&& fn, A... a) {
|
| 406 |
+
return wrap_tensor_node(
|
| 407 |
+
impl::map(std::forward<F>(fn), impl::get_nested_tensor_structure(a)...),
|
| 408 |
+
c10::nullopt,
|
| 409 |
+
c10::nullopt,
|
| 410 |
+
c10::nullopt,
|
| 411 |
+
c10::nullopt);
|
| 412 |
+
}
|
| 413 |
+
|
| 414 |
+
} // namespace native
|
| 415 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace meta {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor & _add_relu_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1);
|
| 21 |
+
TORCH_API at::Tensor & _add_relu_(at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1);
|
| 22 |
+
|
| 23 |
+
} // namespace meta
|
| 24 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cslt_compress_cuda_dispatch.h
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace cuda {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor _cslt_compress(const at::Tensor & input);
|
| 21 |
+
|
| 22 |
+
} // namespace cuda
|
| 23 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fft_c2r.h
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/_fft_c2r_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::_fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
|
| 26 |
+
inline at::Tensor _fft_c2r(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
|
| 27 |
+
return at::_ops::_fft_c2r::call(self, dim, normalization, last_dim_size);
|
| 28 |
+
}
|
| 29 |
+
namespace symint {
|
| 30 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 31 |
+
at::Tensor _fft_c2r(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
|
| 32 |
+
return at::_ops::_fft_c2r::call(self, dim, normalization, last_dim_size);
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
// aten::_fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
|
| 37 |
+
inline at::Tensor _fft_c2r_symint(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
|
| 38 |
+
return at::_ops::_fft_c2r::call(self, dim, normalization, last_dim_size);
|
| 39 |
+
}
|
| 40 |
+
namespace symint {
|
| 41 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 42 |
+
at::Tensor _fft_c2r(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
|
| 43 |
+
return at::_ops::_fft_c2r::call(self, dim, normalization, last_dim_size);
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
// aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
|
| 48 |
+
inline at::Tensor & _fft_c2r_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
|
| 49 |
+
return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
|
| 50 |
+
}
|
| 51 |
+
namespace symint {
|
| 52 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 53 |
+
at::Tensor & _fft_c2r_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
|
| 54 |
+
return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
// aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
|
| 59 |
+
inline at::Tensor & _fft_c2r_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size, at::Tensor & out) {
|
| 60 |
+
return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
|
| 61 |
+
}
|
| 62 |
+
namespace symint {
|
| 63 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 64 |
+
at::Tensor & _fft_c2r_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size, at::Tensor & out) {
|
| 65 |
+
return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
|
| 66 |
+
}
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
// aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
|
| 70 |
+
inline at::Tensor & _fft_c2r_symint_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
|
| 71 |
+
return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
|
| 72 |
+
}
|
| 73 |
+
namespace symint {
|
| 74 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 75 |
+
at::Tensor & _fft_c2r_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
|
| 76 |
+
return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
|
| 81 |
+
inline at::Tensor & _fft_c2r_symint_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size, at::Tensor & out) {
|
| 82 |
+
return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
|
| 83 |
+
}
|
| 84 |
+
namespace symint {
|
| 85 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 86 |
+
at::Tensor & _fft_c2r_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size, at::Tensor & out) {
|
| 87 |
+
return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcdiv_compositeexplicitautograd_dispatch.h
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace compositeexplicitautograd {
|
| 19 |
+
|
| 20 |
+
TORCH_API void _foreach_addcdiv_out(at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1);
|
| 21 |
+
TORCH_API void _foreach_addcdiv_outf(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value, at::TensorList out);
|
| 22 |
+
TORCH_API void _foreach_addcdiv_out(at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars);
|
| 23 |
+
TORCH_API void _foreach_addcdiv_outf(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars, at::TensorList out);
|
| 24 |
+
TORCH_API void _foreach_addcdiv_out(at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars);
|
| 25 |
+
TORCH_API void _foreach_addcdiv_outf(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars, at::TensorList out);
|
| 26 |
+
|
| 27 |
+
} // namespace compositeexplicitautograd
|
| 28 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_cosh_ops.h
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Operator.h
|
| 4 |
+
|
| 5 |
+
#include <tuple>
|
| 6 |
+
#include <vector>
|
| 7 |
+
|
| 8 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 9 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 10 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 11 |
+
#include <ATen/core/ATen_fwd.h>
|
| 12 |
+
|
| 13 |
+
namespace at {
|
| 14 |
+
namespace _ops {
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
struct TORCH_API _foreach_cosh {
|
| 18 |
+
using schema = ::std::vector<at::Tensor> (at::TensorList);
|
| 19 |
+
using ptr_schema = schema*;
|
| 20 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 21 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_cosh")
|
| 22 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 23 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_cosh(Tensor[] self) -> Tensor[]")
|
| 24 |
+
static ::std::vector<at::Tensor> call(at::TensorList self);
|
| 25 |
+
static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self);
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
struct TORCH_API _foreach_cosh_ {
|
| 29 |
+
using schema = void (at::TensorList);
|
| 30 |
+
using ptr_schema = schema*;
|
| 31 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 32 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_cosh_")
|
| 33 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 34 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_cosh_(Tensor(a!)[] self) -> ()")
|
| 35 |
+
static void call(at::TensorList self);
|
| 36 |
+
static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self);
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
struct TORCH_API _foreach_cosh_out {
|
| 40 |
+
using schema = void (at::TensorList, at::TensorList);
|
| 41 |
+
using ptr_schema = schema*;
|
| 42 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 43 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_cosh")
|
| 44 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
|
| 45 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_cosh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()")
|
| 46 |
+
static void call(at::TensorList self, at::TensorList out);
|
| 47 |
+
static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out);
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
}} // namespace at::_ops
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_erf.h
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/_foreach_erf_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::_foreach_erf(Tensor[] self) -> Tensor[]
|
| 26 |
+
inline ::std::vector<at::Tensor> _foreach_erf(at::TensorList self) {
|
| 27 |
+
return at::_ops::_foreach_erf::call(self);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
// aten::_foreach_erf_(Tensor(a!)[] self) -> ()
|
| 31 |
+
inline void _foreach_erf_(at::TensorList self) {
|
| 32 |
+
return at::_ops::_foreach_erf_::call(self);
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
// aten::_foreach_erf.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
|
| 36 |
+
inline void _foreach_erf_out(at::TensorList out, at::TensorList self) {
|
| 37 |
+
return at::_ops::_foreach_erf_out::call(self, out);
|
| 38 |
+
}
|
| 39 |
+
// aten::_foreach_erf.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
|
| 40 |
+
inline void _foreach_erf_outf(at::TensorList self, at::TensorList out) {
|
| 41 |
+
return at::_ops::_foreach_erf_out::call(self, out);
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_minimum.h
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/_foreach_minimum_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::_foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
|
| 26 |
+
inline ::std::vector<at::Tensor> _foreach_minimum(at::TensorList self, const at::Scalar & scalar) {
|
| 27 |
+
return at::_ops::_foreach_minimum_Scalar::call(self, scalar);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
// aten::_foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
|
| 31 |
+
inline void _foreach_minimum_(at::TensorList self, const at::Scalar & scalar) {
|
| 32 |
+
return at::_ops::_foreach_minimum__Scalar::call(self, scalar);
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
// aten::_foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
|
| 36 |
+
inline ::std::vector<at::Tensor> _foreach_minimum(at::TensorList self, at::TensorList other) {
|
| 37 |
+
return at::_ops::_foreach_minimum_List::call(self, other);
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
// aten::_foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
|
| 41 |
+
inline void _foreach_minimum_(at::TensorList self, at::TensorList other) {
|
| 42 |
+
return at::_ops::_foreach_minimum__List::call(self, other);
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
// aten::_foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
|
| 46 |
+
inline ::std::vector<at::Tensor> _foreach_minimum(at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
|
| 47 |
+
return at::_ops::_foreach_minimum_ScalarList::call(self, scalars);
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
// aten::_foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
|
| 51 |
+
inline void _foreach_minimum_(at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
|
| 52 |
+
return at::_ops::_foreach_minimum__ScalarList::call(self, scalars);
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
// aten::_foreach_minimum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
|
| 56 |
+
inline void _foreach_minimum_out(at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
|
| 57 |
+
return at::_ops::_foreach_minimum_Scalar_out::call(self, scalar, out);
|
| 58 |
+
}
|
| 59 |
+
// aten::_foreach_minimum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
|
| 60 |
+
inline void _foreach_minimum_outf(at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
|
| 61 |
+
return at::_ops::_foreach_minimum_Scalar_out::call(self, scalar, out);
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
// aten::_foreach_minimum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
|
| 65 |
+
inline void _foreach_minimum_out(at::TensorList out, at::TensorList self, at::TensorList other) {
|
| 66 |
+
return at::_ops::_foreach_minimum_List_out::call(self, other, out);
|
| 67 |
+
}
|
| 68 |
+
// aten::_foreach_minimum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
|
| 69 |
+
inline void _foreach_minimum_outf(at::TensorList self, at::TensorList other, at::TensorList out) {
|
| 70 |
+
return at::_ops::_foreach_minimum_List_out::call(self, other, out);
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
// aten::_foreach_minimum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
|
| 74 |
+
inline void _foreach_minimum_out(at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
|
| 75 |
+
return at::_ops::_foreach_minimum_ScalarList_out::call(self, scalars, out);
|
| 76 |
+
}
|
| 77 |
+
// aten::_foreach_minimum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
|
| 78 |
+
inline void _foreach_minimum_outf(at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
|
| 79 |
+
return at::_ops::_foreach_minimum_ScalarList_out::call(self, scalars, out);
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_trunc.h
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/_foreach_trunc_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::_foreach_trunc(Tensor[] self) -> Tensor[]
|
| 26 |
+
inline ::std::vector<at::Tensor> _foreach_trunc(at::TensorList self) {
|
| 27 |
+
return at::_ops::_foreach_trunc::call(self);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
// aten::_foreach_trunc_(Tensor(a!)[] self) -> ()
|
| 31 |
+
inline void _foreach_trunc_(at::TensorList self) {
|
| 32 |
+
return at::_ops::_foreach_trunc_::call(self);
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
// aten::_foreach_trunc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
|
| 36 |
+
inline void _foreach_trunc_out(at::TensorList out, at::TensorList self) {
|
| 37 |
+
return at::_ops::_foreach_trunc_out::call(self, out);
|
| 38 |
+
}
|
| 39 |
+
// aten::_foreach_trunc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
|
| 40 |
+
inline void _foreach_trunc_outf(at::TensorList self, at::TensorList out) {
|
| 41 |
+
return at::_ops::_foreach_trunc_out::call(self, out);
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fw_primal.h
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/_fw_primal_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_int_mm_cuda_dispatch.h
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace cuda {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor _int_mm(const at::Tensor & self, const at::Tensor & mat2);
|
| 21 |
+
TORCH_API at::Tensor & _int_mm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2);
|
| 22 |
+
TORCH_API at::Tensor & _int_mm_outf(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out);
|
| 23 |
+
|
| 24 |
+
} // namespace cuda
|
| 25 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace compositeexplicitautograd {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor & _masked_softmax_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, c10::optional<int64_t> dim=c10::nullopt, c10::optional<int64_t> mask_type=c10::nullopt);
|
| 21 |
+
TORCH_API at::Tensor & _masked_softmax_outf(const at::Tensor & self, const at::Tensor & mask, c10::optional<int64_t> dim, c10::optional<int64_t> mask_type, at::Tensor & out);
|
| 22 |
+
|
| 23 |
+
} // namespace compositeexplicitautograd
|
| 24 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_view_from_buffer_cpu_dispatch.h
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace cpu {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor _nested_view_from_buffer(const at::Tensor & self, const at::Tensor & nested_size, const at::Tensor & nested_strides, const at::Tensor & offsets);
|
| 21 |
+
|
| 22 |
+
} // namespace cpu
|
| 23 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_remove_batch_dim.h
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/_remove_batch_dim_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::_remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor
|
| 26 |
+
inline at::Tensor _remove_batch_dim(const at::Tensor & self, int64_t level, int64_t batch_size, int64_t out_dim) {
|
| 27 |
+
return at::_ops::_remove_batch_dim::call(self, level, batch_size, out_dim);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_segment_reduce_backward_cpu_dispatch.h
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace cpu {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor _segment_reduce_backward(const at::Tensor & grad, const at::Tensor & output, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths={}, const c10::optional<at::Tensor> & offsets={}, int64_t axis=0, const c10::optional<at::Scalar> & initial=c10::nullopt);
|
| 21 |
+
|
| 22 |
+
} // namespace cpu
|
| 23 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_bsc_tensor_unsafe_native.h
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from NativeFunction.h
|
| 4 |
+
|
| 5 |
+
#include <c10/core/Scalar.h>
|
| 6 |
+
#include <c10/core/Storage.h>
|
| 7 |
+
#include <c10/core/TensorOptions.h>
|
| 8 |
+
#include <c10/util/Deprecated.h>
|
| 9 |
+
#include <c10/util/Optional.h>
|
| 10 |
+
#include <c10/core/QScheme.h>
|
| 11 |
+
#include <ATen/core/Reduction.h>
|
| 12 |
+
#include <ATen/core/Tensor.h>
|
| 13 |
+
#include <tuple>
|
| 14 |
+
#include <vector>
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
namespace at {
|
| 18 |
+
namespace native {
|
| 19 |
+
TORCH_API at::Tensor _sparse_bsc_tensor_unsafe(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={});
|
| 20 |
+
} // namespace native
|
| 21 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace compositeimplicitautograd {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={});
|
| 21 |
+
TORCH_API at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
|
| 22 |
+
TORCH_API at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={});
|
| 23 |
+
TORCH_API at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
|
| 24 |
+
|
| 25 |
+
} // namespace compositeimplicitautograd
|
| 26 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_differentiable_lstm_cell_backward_native.h
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from NativeFunction.h
|
| 4 |
+
|
| 5 |
+
#include <c10/core/Scalar.h>
|
| 6 |
+
#include <c10/core/Storage.h>
|
| 7 |
+
#include <c10/core/TensorOptions.h>
|
| 8 |
+
#include <c10/util/Deprecated.h>
|
| 9 |
+
#include <c10/util/Optional.h>
|
| 10 |
+
#include <c10/core/QScheme.h>
|
| 11 |
+
#include <ATen/core/Reduction.h>
|
| 12 |
+
#include <ATen/core/Tensor.h>
|
| 13 |
+
#include <tuple>
|
| 14 |
+
#include <vector>
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
namespace at {
|
| 18 |
+
namespace native {
|
| 19 |
+
TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _thnn_differentiable_lstm_cell_backward(const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const c10::optional<at::Tensor> & input_bias, const c10::optional<at::Tensor> & hidden_bias, const at::Tensor & cx, const at::Tensor & cy);
|
| 20 |
+
} // namespace native
|
| 21 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Operator.h
|
| 4 |
+
|
| 5 |
+
#include <tuple>
|
| 6 |
+
#include <vector>
|
| 7 |
+
|
| 8 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 9 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 10 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 11 |
+
#include <ATen/core/ATen_fwd.h>
|
| 12 |
+
|
| 13 |
+
namespace at {
|
| 14 |
+
namespace _ops {
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
struct TORCH_API _to_sparse_bsr {
|
| 18 |
+
using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, c10::optional<int64_t>);
|
| 19 |
+
using ptr_schema = schema*;
|
| 20 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 21 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_to_sparse_bsr")
|
| 22 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 23 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor")
|
| 24 |
+
static at::Tensor call(const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim);
|
| 25 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim);
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
struct TORCH_API _to_sparse_bsr_out {
|
| 29 |
+
using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, c10::optional<int64_t>, at::Tensor &);
|
| 30 |
+
using ptr_schema = schema*;
|
| 31 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 32 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_to_sparse_bsr")
|
| 33 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
|
| 34 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)")
|
| 35 |
+
static at::Tensor & call(const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out);
|
| 36 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out);
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
}} // namespace at::_ops
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_cuda_dispatch.h
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace cuda {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor _to_sparse(const at::Tensor & self, int64_t sparse_dim);
|
| 21 |
+
TORCH_API at::Tensor _to_sparse(const at::Tensor & self, c10::optional<at::Layout> layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional<int64_t> dense_dim=c10::nullopt);
|
| 22 |
+
|
| 23 |
+
} // namespace cuda
|
| 24 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward.h
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/_upsample_bicubic2d_aa_backward_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
|
| 26 |
+
inline at::Tensor & _upsample_bicubic2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
|
| 27 |
+
return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
|
| 28 |
+
}
|
| 29 |
+
namespace symint {
|
| 30 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 31 |
+
at::Tensor & _upsample_bicubic2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
|
| 32 |
+
return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
// aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
|
| 37 |
+
inline at::Tensor & _upsample_bicubic2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
|
| 38 |
+
return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
|
| 39 |
+
}
|
| 40 |
+
namespace symint {
|
| 41 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 42 |
+
at::Tensor & _upsample_bicubic2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
|
| 43 |
+
return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
// aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
|
| 48 |
+
inline at::Tensor & _upsample_bicubic2d_aa_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
|
| 49 |
+
return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
|
| 50 |
+
}
|
| 51 |
+
namespace symint {
|
| 52 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 53 |
+
at::Tensor & _upsample_bicubic2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
|
| 54 |
+
return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
// aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
|
| 59 |
+
inline at::Tensor & _upsample_bicubic2d_aa_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
|
| 60 |
+
return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
|
| 61 |
+
}
|
| 62 |
+
namespace symint {
|
| 63 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 64 |
+
at::Tensor & _upsample_bicubic2d_aa_backward_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
|
| 65 |
+
return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
|
| 66 |
+
}
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
// aten::_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
|
| 70 |
+
inline at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
|
| 71 |
+
return at::_ops::_upsample_bicubic2d_aa_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
|
| 72 |
+
}
|
| 73 |
+
namespace symint {
|
| 74 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 75 |
+
at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
|
| 76 |
+
return at::_ops::_upsample_bicubic2d_aa_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// aten::_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
|
| 81 |
+
inline at::Tensor _upsample_bicubic2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
|
| 82 |
+
return at::_ops::_upsample_bicubic2d_aa_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w);
|
| 83 |
+
}
|
| 84 |
+
namespace symint {
|
| 85 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 86 |
+
at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
|
| 87 |
+
return at::_ops::_upsample_bicubic2d_aa_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w);
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_ops.h
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Operator.h
|
| 4 |
+
|
| 5 |
+
#include <tuple>
|
| 6 |
+
#include <vector>
|
| 7 |
+
|
| 8 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 9 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 10 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 11 |
+
#include <ATen/core/ATen_fwd.h>
|
| 12 |
+
|
| 13 |
+
namespace at {
|
| 14 |
+
namespace _ops {
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
struct TORCH_API all_dim {
|
| 18 |
+
using schema = at::Tensor (const at::Tensor &, int64_t, bool);
|
| 19 |
+
using ptr_schema = schema*;
|
| 20 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 21 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
|
| 22 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dim")
|
| 23 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor")
|
| 24 |
+
static at::Tensor call(const at::Tensor & self, int64_t dim, bool keepdim);
|
| 25 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim);
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
struct TORCH_API all_dims {
|
| 29 |
+
using schema = at::Tensor (const at::Tensor &, at::OptionalIntArrayRef, bool);
|
| 30 |
+
using ptr_schema = schema*;
|
| 31 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 32 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
|
| 33 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dims")
|
| 34 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor")
|
| 35 |
+
static at::Tensor call(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim);
|
| 36 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim);
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
struct TORCH_API all_out {
|
| 40 |
+
using schema = at::Tensor & (const at::Tensor &, int64_t, bool, at::Tensor &);
|
| 41 |
+
using ptr_schema = schema*;
|
| 42 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 43 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
|
| 44 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
|
| 45 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)")
|
| 46 |
+
static at::Tensor & call(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out);
|
| 47 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out);
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
struct TORCH_API all_dims_out {
|
| 51 |
+
using schema = at::Tensor & (const at::Tensor &, at::OptionalIntArrayRef, bool, at::Tensor &);
|
| 52 |
+
using ptr_schema = schema*;
|
| 53 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 54 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
|
| 55 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dims_out")
|
| 56 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)")
|
| 57 |
+
static at::Tensor & call(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out);
|
| 58 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out);
|
| 59 |
+
};
|
| 60 |
+
|
| 61 |
+
struct TORCH_API all_dimname {
|
| 62 |
+
using schema = at::Tensor (const at::Tensor &, at::Dimname, bool);
|
| 63 |
+
using ptr_schema = schema*;
|
| 64 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 65 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
|
| 66 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dimname")
|
| 67 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor")
|
| 68 |
+
static at::Tensor call(const at::Tensor & self, at::Dimname dim, bool keepdim);
|
| 69 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim);
|
| 70 |
+
};
|
| 71 |
+
|
| 72 |
+
struct TORCH_API all_dimname_out {
|
| 73 |
+
using schema = at::Tensor & (const at::Tensor &, at::Dimname, bool, at::Tensor &);
|
| 74 |
+
using ptr_schema = schema*;
|
| 75 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 76 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
|
| 77 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dimname_out")
|
| 78 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)")
|
| 79 |
+
static at::Tensor & call(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & out);
|
| 80 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & out);
|
| 81 |
+
};
|
| 82 |
+
|
| 83 |
+
struct TORCH_API all {
|
| 84 |
+
using schema = at::Tensor (const at::Tensor &);
|
| 85 |
+
using ptr_schema = schema*;
|
| 86 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 87 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
|
| 88 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 89 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all(Tensor self) -> Tensor")
|
| 90 |
+
static at::Tensor call(const at::Tensor & self);
|
| 91 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
|
| 92 |
+
};
|
| 93 |
+
|
| 94 |
+
struct TORCH_API all_all_out {
|
| 95 |
+
using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
|
| 96 |
+
using ptr_schema = schema*;
|
| 97 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 98 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
|
| 99 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "all_out")
|
| 100 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
|
| 101 |
+
static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
|
| 102 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
|
| 103 |
+
};
|
| 104 |
+
|
| 105 |
+
}} // namespace at::_ops
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arctan2_ops.h
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Operator.h
|
| 4 |
+
|
| 5 |
+
#include <tuple>
|
| 6 |
+
#include <vector>
|
| 7 |
+
|
| 8 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 9 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 10 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 11 |
+
#include <ATen/core/ATen_fwd.h>
|
| 12 |
+
|
| 13 |
+
namespace at {
|
| 14 |
+
namespace _ops {
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
struct TORCH_API arctan2 {
|
| 18 |
+
using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
|
| 19 |
+
using ptr_schema = schema*;
|
| 20 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 21 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arctan2")
|
| 22 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 23 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arctan2(Tensor self, Tensor other) -> Tensor")
|
| 24 |
+
static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
|
| 25 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
struct TORCH_API arctan2_out {
|
| 29 |
+
using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
|
| 30 |
+
using ptr_schema = schema*;
|
| 31 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 32 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arctan2")
|
| 33 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
|
| 34 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")
|
| 35 |
+
static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
|
| 36 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
struct TORCH_API arctan2_ {
|
| 40 |
+
using schema = at::Tensor & (at::Tensor &, const at::Tensor &);
|
| 41 |
+
using ptr_schema = schema*;
|
| 42 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 43 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arctan2_")
|
| 44 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 45 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arctan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)")
|
| 46 |
+
static at::Tensor & call(at::Tensor & self, const at::Tensor & other);
|
| 47 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other);
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
}} // namespace at::_ops
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/asin_compositeexplicitautogradnonfunctional_dispatch.h
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace compositeexplicitautogradnonfunctional {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor asin(const at::Tensor & self);
|
| 21 |
+
TORCH_API at::Tensor & asin_(at::Tensor & self);
|
| 22 |
+
|
| 23 |
+
} // namespace compositeexplicitautogradnonfunctional
|
| 24 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_backward_native.h
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from NativeFunction.h
|
| 4 |
+
|
| 5 |
+
#include <c10/core/Scalar.h>
|
| 6 |
+
#include <c10/core/Storage.h>
|
| 7 |
+
#include <c10/core/TensorOptions.h>
|
| 8 |
+
#include <c10/util/Deprecated.h>
|
| 9 |
+
#include <c10/util/Optional.h>
|
| 10 |
+
#include <c10/core/QScheme.h>
|
| 11 |
+
#include <ATen/core/Reduction.h>
|
| 12 |
+
#include <ATen/core/Tensor.h>
|
| 13 |
+
#include <tuple>
|
| 14 |
+
#include <vector>
|
| 15 |
+
#include <ATen/ops/avg_pool2d_backward_meta.h>
|
| 16 |
+
|
| 17 |
+
namespace at {
|
| 18 |
+
namespace native {
|
| 19 |
+
struct TORCH_API structured_avg_pool2d_backward_out_cpu : public at::meta::structured_avg_pool2d_backward {
|
| 20 |
+
void impl(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, const at::Tensor & grad_input);
|
| 21 |
+
};
|
| 22 |
+
struct TORCH_API structured_avg_pool2d_backward_out_cuda : public at::meta::structured_avg_pool2d_backward {
|
| 23 |
+
void impl(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, const at::Tensor & grad_input);
|
| 24 |
+
};
|
| 25 |
+
TORCH_API at::Tensor mkldnn_avg_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override);
|
| 26 |
+
TORCH_API at::Tensor & mkldnn_avg_pool2d_backward_out(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, at::Tensor & grad_input);
|
| 27 |
+
} // namespace native
|
| 28 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_xor_meta.h
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from NativeMetaFunction.h
|
| 4 |
+
|
| 5 |
+
#include <c10/core/Scalar.h>
|
| 6 |
+
#include <c10/core/Storage.h>
|
| 7 |
+
#include <c10/core/TensorOptions.h>
|
| 8 |
+
#include <c10/util/Deprecated.h>
|
| 9 |
+
#include <c10/util/Optional.h>
|
| 10 |
+
#include <c10/core/QScheme.h>
|
| 11 |
+
#include <ATen/core/Reduction.h>
|
| 12 |
+
#include <ATen/TensorIterator.h>
|
| 13 |
+
#include <ATen/TensorMeta.h>
|
| 14 |
+
#include <tuple>
|
| 15 |
+
#include <vector>
|
| 16 |
+
|
| 17 |
+
namespace at {
|
| 18 |
+
namespace meta {
|
| 19 |
+
|
| 20 |
+
struct TORCH_API structured_bitwise_xor_Tensor : public TensorIteratorBase {
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
void meta(const at::Tensor & self, const at::Tensor & other);
|
| 24 |
+
};
|
| 25 |
+
|
| 26 |
+
} // namespace native
|
| 27 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_meta.h
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from NativeMetaFunction.h
|
| 4 |
+
|
| 5 |
+
#include <c10/core/Scalar.h>
|
| 6 |
+
#include <c10/core/Storage.h>
|
| 7 |
+
#include <c10/core/TensorOptions.h>
|
| 8 |
+
#include <c10/util/Deprecated.h>
|
| 9 |
+
#include <c10/util/Optional.h>
|
| 10 |
+
#include <c10/core/QScheme.h>
|
| 11 |
+
#include <ATen/core/Reduction.h>
|
| 12 |
+
#include <ATen/TensorIterator.h>
|
| 13 |
+
#include <ATen/TensorMeta.h>
|
| 14 |
+
#include <tuple>
|
| 15 |
+
#include <vector>
|
| 16 |
+
|
| 17 |
+
namespace at {
|
| 18 |
+
namespace meta {
|
| 19 |
+
|
| 20 |
+
struct TORCH_API structured_ceil : public TensorIteratorBase {
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
void meta(const at::Tensor & self);
|
| 24 |
+
};
|
| 25 |
+
|
| 26 |
+
} // namespace native
|
| 27 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clamp_ops.h
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Operator.h
|
| 4 |
+
|
| 5 |
+
#include <tuple>
|
| 6 |
+
#include <vector>
|
| 7 |
+
|
| 8 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 9 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 10 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 11 |
+
#include <ATen/core/ATen_fwd.h>
|
| 12 |
+
|
| 13 |
+
namespace at {
|
| 14 |
+
namespace _ops {
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
struct TORCH_API clamp {
|
| 18 |
+
using schema = at::Tensor (const at::Tensor &, const c10::optional<at::Scalar> &, const c10::optional<at::Scalar> &);
|
| 19 |
+
using ptr_schema = schema*;
|
| 20 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 21 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp")
|
| 22 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 23 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor")
|
| 24 |
+
static at::Tensor call(const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max);
|
| 25 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max);
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
struct TORCH_API clamp_Tensor {
|
| 29 |
+
using schema = at::Tensor (const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &);
|
| 30 |
+
using ptr_schema = schema*;
|
| 31 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 32 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp")
|
| 33 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
|
| 34 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor")
|
| 35 |
+
static at::Tensor call(const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max);
|
| 36 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max);
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
struct TORCH_API clamp_ {
|
| 40 |
+
using schema = at::Tensor & (at::Tensor &, const c10::optional<at::Scalar> &, const c10::optional<at::Scalar> &);
|
| 41 |
+
using ptr_schema = schema*;
|
| 42 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 43 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp_")
|
| 44 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 45 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)")
|
| 46 |
+
static at::Tensor & call(at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max);
|
| 47 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max);
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
struct TORCH_API clamp__Tensor {
|
| 51 |
+
using schema = at::Tensor & (at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &);
|
| 52 |
+
using ptr_schema = schema*;
|
| 53 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 54 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp_")
|
| 55 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
|
| 56 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)")
|
| 57 |
+
static at::Tensor & call(at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max);
|
| 58 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max);
|
| 59 |
+
};
|
| 60 |
+
|
| 61 |
+
struct TORCH_API clamp_out {
|
| 62 |
+
using schema = at::Tensor & (const at::Tensor &, const c10::optional<at::Scalar> &, const c10::optional<at::Scalar> &, at::Tensor &);
|
| 63 |
+
using ptr_schema = schema*;
|
| 64 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 65 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp")
|
| 66 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
|
| 67 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)")
|
| 68 |
+
static at::Tensor & call(const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max, at::Tensor & out);
|
| 69 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max, at::Tensor & out);
|
| 70 |
+
};
|
| 71 |
+
|
| 72 |
+
struct TORCH_API clamp_Tensor_out {
|
| 73 |
+
using schema = at::Tensor & (const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, at::Tensor &);
|
| 74 |
+
using ptr_schema = schema*;
|
| 75 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 76 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp")
|
| 77 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor_out")
|
| 78 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)")
|
| 79 |
+
static at::Tensor & call(const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max, at::Tensor & out);
|
| 80 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max, at::Tensor & out);
|
| 81 |
+
};
|
| 82 |
+
|
| 83 |
+
}} // namespace at::_ops
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward.h
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/convolution_backward_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
|
| 26 |
+
inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
|
| 27 |
+
return at::_ops::convolution_backward::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
|
| 28 |
+
}
|
| 29 |
+
namespace symint {
|
| 30 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 31 |
+
::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
|
| 32 |
+
return at::_ops::convolution_backward::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
// aten::convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
|
| 37 |
+
inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward_symint(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
|
| 38 |
+
return at::_ops::convolution_backward::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask);
|
| 39 |
+
}
|
| 40 |
+
namespace symint {
|
| 41 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 42 |
+
::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
|
| 43 |
+
return at::_ops::convolution_backward::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask);
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
// aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
|
| 48 |
+
inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
|
| 49 |
+
return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
|
| 50 |
+
}
|
| 51 |
+
namespace symint {
|
| 52 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 53 |
+
::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
|
| 54 |
+
return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
// aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
|
| 59 |
+
inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
|
| 60 |
+
return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
|
| 61 |
+
}
|
| 62 |
+
namespace symint {
|
| 63 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 64 |
+
::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
|
| 65 |
+
return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
|
| 66 |
+
}
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
// aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
|
| 70 |
+
inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_symint_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
|
| 71 |
+
return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
|
| 72 |
+
}
|
| 73 |
+
namespace symint {
|
| 74 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 75 |
+
::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
|
| 76 |
+
return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
|
| 81 |
+
inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
|
| 82 |
+
return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
|
| 83 |
+
}
|
| 84 |
+
namespace symint {
|
| 85 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 86 |
+
::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
|
| 87 |
+
return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward_cuda_dispatch.h
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace cuda {
|
| 19 |
+
|
| 20 |
+
TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask);
|
| 21 |
+
TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward_symint(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask);
|
| 22 |
+
|
| 23 |
+
} // namespace cuda
|
| 24 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_add_relu.h
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/cudnn_convolution_add_relu_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
|
| 26 |
+
inline at::Tensor cudnn_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
|
| 27 |
+
return at::_ops::cudnn_convolution_add_relu::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
|
| 28 |
+
}
|
| 29 |
+
namespace symint {
|
| 30 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 31 |
+
at::Tensor cudnn_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
|
| 32 |
+
return at::_ops::cudnn_convolution_add_relu::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
|
| 33 |
+
}
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
// aten::cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
|
| 37 |
+
inline at::Tensor cudnn_convolution_add_relu_symint(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
|
| 38 |
+
return at::_ops::cudnn_convolution_add_relu::call(self, weight, z, alpha, bias, stride, padding, dilation, groups);
|
| 39 |
+
}
|
| 40 |
+
namespace symint {
|
| 41 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 42 |
+
at::Tensor cudnn_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
|
| 43 |
+
return at::_ops::cudnn_convolution_add_relu::call(self, weight, z, alpha, bias, stride, padding, dilation, groups);
|
| 44 |
+
}
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
// aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
|
| 48 |
+
inline at::Tensor & cudnn_convolution_add_relu_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
|
| 49 |
+
return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
|
| 50 |
+
}
|
| 51 |
+
namespace symint {
|
| 52 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 53 |
+
at::Tensor & cudnn_convolution_add_relu_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
|
| 54 |
+
return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
|
| 55 |
+
}
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
// aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
|
| 59 |
+
inline at::Tensor & cudnn_convolution_add_relu_outf(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
|
| 60 |
+
return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
|
| 61 |
+
}
|
| 62 |
+
namespace symint {
|
| 63 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
|
| 64 |
+
at::Tensor & cudnn_convolution_add_relu_outf(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
|
| 65 |
+
return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
|
| 66 |
+
}
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
// aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
|
| 70 |
+
inline at::Tensor & cudnn_convolution_add_relu_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
|
| 71 |
+
return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
|
| 72 |
+
}
|
| 73 |
+
namespace symint {
|
| 74 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 75 |
+
at::Tensor & cudnn_convolution_add_relu_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
|
| 76 |
+
return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
// aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
|
| 81 |
+
inline at::Tensor & cudnn_convolution_add_relu_symint_outf(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
|
| 82 |
+
return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
|
| 83 |
+
}
|
| 84 |
+
namespace symint {
|
| 85 |
+
template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
|
| 86 |
+
at::Tensor & cudnn_convolution_add_relu_outf(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
|
| 87 |
+
return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
|
| 88 |
+
}
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_add_relu_native.h
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from NativeFunction.h
|
| 4 |
+
|
| 5 |
+
#include <c10/core/Scalar.h>
|
| 6 |
+
#include <c10/core/Storage.h>
|
| 7 |
+
#include <c10/core/TensorOptions.h>
|
| 8 |
+
#include <c10/util/Deprecated.h>
|
| 9 |
+
#include <c10/util/Optional.h>
|
| 10 |
+
#include <c10/core/QScheme.h>
|
| 11 |
+
#include <ATen/core/Reduction.h>
|
| 12 |
+
#include <ATen/core/Tensor.h>
|
| 13 |
+
#include <tuple>
|
| 14 |
+
#include <vector>
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
namespace at {
|
| 18 |
+
namespace native {
|
| 19 |
+
TORCH_API at::Tensor & cudnn_convolution_add_relu_out_symint(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out);
|
| 20 |
+
TORCH_API at::Tensor cudnn_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups);
|
| 21 |
+
} // namespace native
|
| 22 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_native.h
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from NativeFunction.h
|
| 4 |
+
|
| 5 |
+
#include <c10/core/Scalar.h>
|
| 6 |
+
#include <c10/core/Storage.h>
|
| 7 |
+
#include <c10/core/TensorOptions.h>
|
| 8 |
+
#include <c10/util/Deprecated.h>
|
| 9 |
+
#include <c10/util/Optional.h>
|
| 10 |
+
#include <c10/core/QScheme.h>
|
| 11 |
+
#include <ATen/core/Reduction.h>
|
| 12 |
+
#include <ATen/core/Tensor.h>
|
| 13 |
+
#include <tuple>
|
| 14 |
+
#include <vector>
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
namespace at {
|
| 18 |
+
namespace native {
|
| 19 |
+
TORCH_API at::Tensor cudnn_convolution(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32);
|
| 20 |
+
TORCH_API at::Tensor & cudnn_convolution_out(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out);
|
| 21 |
+
} // namespace native
|
| 22 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/eye_compositeexplicitautograd_dispatch.h
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace compositeexplicitautograd {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor eye(int64_t n, at::TensorOptions options={});
|
| 21 |
+
TORCH_API at::Tensor eye(int64_t n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
|
| 22 |
+
TORCH_API at::Tensor eye_symint(c10::SymInt n, at::TensorOptions options={});
|
| 23 |
+
TORCH_API at::Tensor eye_symint(c10::SymInt n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
|
| 24 |
+
TORCH_API at::Tensor eye(int64_t n, int64_t m, at::TensorOptions options={});
|
| 25 |
+
TORCH_API at::Tensor eye(int64_t n, int64_t m, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
|
| 26 |
+
TORCH_API at::Tensor eye_symint(c10::SymInt n, c10::SymInt m, at::TensorOptions options={});
|
| 27 |
+
TORCH_API at::Tensor eye_symint(c10::SymInt n, c10::SymInt m, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
|
| 28 |
+
|
| 29 |
+
} // namespace compositeexplicitautograd
|
| 30 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16.h
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Function.h
|
| 4 |
+
|
| 5 |
+
#include <ATen/Context.h>
|
| 6 |
+
#include <ATen/DeviceGuard.h>
|
| 7 |
+
#include <ATen/TensorUtils.h>
|
| 8 |
+
#include <ATen/TracerMode.h>
|
| 9 |
+
#include <ATen/core/Generator.h>
|
| 10 |
+
#include <ATen/core/Reduction.h>
|
| 11 |
+
#include <ATen/core/Tensor.h>
|
| 12 |
+
#include <c10/core/Scalar.h>
|
| 13 |
+
#include <c10/core/Storage.h>
|
| 14 |
+
#include <c10/core/TensorOptions.h>
|
| 15 |
+
#include <c10/util/Deprecated.h>
|
| 16 |
+
#include <c10/util/Optional.h>
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
#include <ATen/ops/fbgemm_pack_gemm_matrix_fp16_ops.h>
|
| 21 |
+
|
| 22 |
+
namespace at {
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
// aten::fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor
|
| 26 |
+
inline at::Tensor fbgemm_pack_gemm_matrix_fp16(const at::Tensor & input) {
|
| 27 |
+
return at::_ops::fbgemm_pack_gemm_matrix_fp16::call(input);
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
}
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfftn_ops.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Operator.h
|
| 4 |
+
|
| 5 |
+
#include <tuple>
|
| 6 |
+
#include <vector>
|
| 7 |
+
|
| 8 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 9 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 10 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 11 |
+
#include <ATen/core/ATen_fwd.h>
|
| 12 |
+
|
| 13 |
+
namespace at {
|
| 14 |
+
namespace _ops {
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
struct TORCH_API fft_irfftn {
|
| 18 |
+
using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, at::OptionalIntArrayRef, c10::optional<c10::string_view>);
|
| 19 |
+
using ptr_schema = schema*;
|
| 20 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 21 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_irfftn")
|
| 22 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 23 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor")
|
| 24 |
+
static at::Tensor call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm);
|
| 25 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm);
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
struct TORCH_API fft_irfftn_out {
|
| 29 |
+
using schema = at::Tensor & (const at::Tensor &, at::OptionalSymIntArrayRef, at::OptionalIntArrayRef, c10::optional<c10::string_view>, at::Tensor &);
|
| 30 |
+
using ptr_schema = schema*;
|
| 31 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 32 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_irfftn")
|
| 33 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
|
| 34 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)")
|
| 35 |
+
static at::Tensor & call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out);
|
| 36 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out);
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
}} // namespace at::_ops
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fractional_max_pool3d_cpu_dispatch.h
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace cpu {
|
| 19 |
+
|
| 20 |
+
TORCH_API ::std::tuple<at::Tensor,at::Tensor> fractional_max_pool3d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples);
|
| 21 |
+
TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool3d_out(at::Tensor & output, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples);
|
| 22 |
+
TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool3d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples, at::Tensor & output, at::Tensor & indices);
|
| 23 |
+
|
| 24 |
+
} // namespace cpu
|
| 25 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/glu_jvp_ops.h
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Operator.h
|
| 4 |
+
|
| 5 |
+
#include <tuple>
|
| 6 |
+
#include <vector>
|
| 7 |
+
|
| 8 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 9 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 10 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 11 |
+
#include <ATen/core/ATen_fwd.h>
|
| 12 |
+
|
| 13 |
+
namespace at {
|
| 14 |
+
namespace _ops {
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
struct TORCH_API glu_jvp {
|
| 18 |
+
using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t);
|
| 19 |
+
using ptr_schema = schema*;
|
| 20 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 21 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::glu_jvp")
|
| 22 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 23 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor")
|
| 24 |
+
static at::Tensor call(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim);
|
| 25 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim);
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
struct TORCH_API glu_jvp_out {
|
| 29 |
+
using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &);
|
| 30 |
+
using ptr_schema = schema*;
|
| 31 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 32 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::glu_jvp")
|
| 33 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
|
| 34 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "glu_jvp.out(Tensor glu, Tensor x, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)")
|
| 35 |
+
static at::Tensor & call(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out);
|
| 36 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out);
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
}} // namespace at::_ops
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace cpu {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor hardswish_backward(const at::Tensor & grad_output, const at::Tensor & self);
|
| 21 |
+
|
| 22 |
+
} // namespace cpu
|
| 23 |
+
} // namespace at
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardswish_ops.h
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
|
| 3 |
+
// @generated by torchgen/gen.py from Operator.h
|
| 4 |
+
|
| 5 |
+
#include <tuple>
|
| 6 |
+
#include <vector>
|
| 7 |
+
|
| 8 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 9 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 10 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 11 |
+
#include <ATen/core/ATen_fwd.h>
|
| 12 |
+
|
| 13 |
+
namespace at {
|
| 14 |
+
namespace _ops {
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
struct TORCH_API hardswish_out {
|
| 18 |
+
using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
|
| 19 |
+
using ptr_schema = schema*;
|
| 20 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 21 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::hardswish")
|
| 22 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
|
| 23 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
|
| 24 |
+
static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
|
| 25 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
|
| 26 |
+
};
|
| 27 |
+
|
| 28 |
+
struct TORCH_API hardswish {
|
| 29 |
+
using schema = at::Tensor (const at::Tensor &);
|
| 30 |
+
using ptr_schema = schema*;
|
| 31 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 32 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::hardswish")
|
| 33 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 34 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "hardswish(Tensor self) -> Tensor")
|
| 35 |
+
static at::Tensor call(const at::Tensor & self);
|
| 36 |
+
static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
|
| 37 |
+
};
|
| 38 |
+
|
| 39 |
+
struct TORCH_API hardswish_ {
|
| 40 |
+
using schema = at::Tensor & (at::Tensor &);
|
| 41 |
+
using ptr_schema = schema*;
|
| 42 |
+
// See Note [static constexpr char* members for windows NVCC]
|
| 43 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::hardswish_")
|
| 44 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
|
| 45 |
+
STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "hardswish_(Tensor(a!) self) -> Tensor(a!)")
|
| 46 |
+
static at::Tensor & call(at::Tensor & self);
|
| 47 |
+
static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);
|
| 48 |
+
};
|
| 49 |
+
|
| 50 |
+
}} // namespace at::_ops
|
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_put_compositeexplicitautograd_dispatch.h
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma once
|
| 2 |
+
// @generated by torchgen/gen.py from DispatchKeyFunction.h
|
| 3 |
+
|
| 4 |
+
// NB: The implementing C++ file is RegisterDispatchKey.cpp
|
| 5 |
+
|
| 6 |
+
// The only #includes we need are for custom classes that have defaults in the C++ API
|
| 7 |
+
#include <c10/core/MemoryFormat.h>
|
| 8 |
+
#include <c10/core/Scalar.h>
|
| 9 |
+
#include <ATen/core/Reduction.h>
|
| 10 |
+
|
| 11 |
+
// Forward declarations of any types needed in the operator signatures.
|
| 12 |
+
// We can't directly include these classes because it will cause circular include dependencies.
|
| 13 |
+
// This file is included by TensorBody.h, which defines the Tensor class.
|
| 14 |
+
#include <ATen/core/ATen_fwd.h>
|
| 15 |
+
|
| 16 |
+
namespace at {
|
| 17 |
+
|
| 18 |
+
namespace compositeexplicitautograd {
|
| 19 |
+
|
| 20 |
+
TORCH_API at::Tensor index_put(const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false);
|
| 21 |
+
TORCH_API at::Tensor & index_put_out(at::Tensor & out, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false);
|
| 22 |
+
TORCH_API at::Tensor & index_put_outf(const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate, at::Tensor & out);
|
| 23 |
+
TORCH_API at::Tensor & index_put_(at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false);
|
| 24 |
+
|
| 25 |
+
} // namespace compositeexplicitautograd
|
| 26 |
+
} // namespace at
|