koichi12 commited on
Commit
e567151
·
verified ·
1 Parent(s): 0f50d34

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CopyKernel.h +12 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Reduce.h +314 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h +144 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/StackKernel.h +12 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/int_mm_kernel.h +16 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/MPSGraphVenturaOps.h +197 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorFactories.h +7 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h +81 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h +44 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h +415 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h +24 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cslt_compress_cuda_dispatch.h +23 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fft_c2r.h +91 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcdiv_compositeexplicitautograd_dispatch.h +28 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_cosh_ops.h +50 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_erf.h +44 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_minimum.h +82 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_trunc.h +44 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fw_primal.h +26 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_int_mm_cuda_dispatch.h +25 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h +24 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_view_from_buffer_cpu_dispatch.h +23 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_remove_batch_dim.h +30 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_segment_reduce_backward_cpu_dispatch.h +23 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_bsc_tensor_unsafe_native.h +21 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h +26 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_differentiable_lstm_cell_backward_native.h +21 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h +39 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_cuda_dispatch.h +24 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward.h +91 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_ops.h +105 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arctan2_ops.h +50 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/asin_compositeexplicitautogradnonfunctional_dispatch.h +24 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_backward_native.h +28 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_xor_meta.h +27 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_meta.h +27 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clamp_ops.h +83 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward.h +91 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward_cuda_dispatch.h +24 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_add_relu.h +91 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_add_relu_native.h +22 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_native.h +22 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/eye_compositeexplicitautograd_dispatch.h +30 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16.h +30 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfftn_ops.h +39 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fractional_max_pool3d_cpu_dispatch.h +25 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/glu_jvp_ops.h +39 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h +23 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardswish_ops.h +50 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_put_compositeexplicitautograd_dispatch.h +26 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CopyKernel.h ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

namespace at {
// Forward declarations so this header is self-contained. The original
// header only declared TensorIteratorBase, leaving `TensorIterator`
// below unresolved unless the includer happened to pull in
// ATen/TensorIterator.h first.
struct TensorIteratorBase;
struct TensorIterator;

namespace native {
inline namespace CPU_CAPABILITY {

// Element-wise copy kernel that writes source values straight through.
void direct_copy_kernel(TensorIteratorBase &iter);
// General CPU copy kernel. The second parameter is deliberately unnamed
// (/*non_blocking*/): it exists for signature parity with device copies.
void copy_kernel(TensorIterator& iter, bool /*non_blocking*/);

}}} // namespace at::native::CPU_CAPABILITY
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Reduce.h ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/native/cpu/Loops.h>
4
+ #include <ATen/Parallel.h>
5
+ #include <c10/util/TypeList.h>
6
+ #include <c10/core/Scalar.h>
7
+ #include <c10/util/irange.h>
8
+
9
+ #include <sstream>
10
+ #include <type_traits>
11
+
12
+ namespace at { namespace native { inline namespace CPU_CAPABILITY {
13
+
14
+ using namespace vec;
15
+
16
// Common prologue for the vectorized reduction helpers below. Defines:
//   scalar_t - the op's result type (from function_traits)
//   Vec      - its SIMD vector type
//   out_ptr  - the output buffer, data[0]
// The (void) cast silences unused-variable warnings in loops that never
// store through out_ptr directly.
#define VEC_LOOP_HEADER(func_t, data) \
  using scalar_t = typename function_traits<func_t>::result_type; \
  using Vec = Vectorized<scalar_t>; \
  char* out_ptr = data[0]; \
  (void) out_ptr;
21
+
22
// reduction that is contiguous over the input in dim 0:
// the output stays put along dim 0 (stride 0) while the input advances
// one element (sizeof(arg2_t) bytes) per step.
template <typename traits>
static inline bool is_contiguous_reduction(const int64_t* strides) {
  const int64_t input_elem_bytes = sizeof(typename traits::arg2_t);
  if (strides[0] != 0) {
    return false;
  }
  return strides[1] == input_elem_bytes;
}
28
+
29
// reduction that is contiguous over the input in dim 1:
// the output stays put along dim 0 (stride 0), and along dim 1 both the
// output and the input step one element at a time.
template <typename traits>
static inline bool is_outer_reduction(const int64_t* strides) {
  if (strides[0] != 0) {
    return false;
  }
  const int64_t out_elem_bytes = sizeof(typename traits::result_type);
  const int64_t in_elem_bytes = sizeof(typename traits::arg2_t);
  return strides[2] == out_elem_bytes && strides[3] == in_elem_bytes;
}
36
+
37
// Core SIMD reduction over `n` groups of 4 vector registers, `stride`
// bytes apart. data[0] is the output/accumulator, data[1] the input.
// Each iteration folds 4 * Vec::size() scalars into four vector
// accumulators with `vop`. If `reduce` is true, the four accumulators
// are collapsed (vop pairwise, then `op` across the final vector's
// lanes) into one scalar that is op-combined into *out_ptr. Otherwise
// the four partial vectors are vop-combined element-wise with the four
// vectors already stored at out_ptr.
template <typename func_t, typename vec_func_t>
static inline void vectorized_reduction(char** data, int64_t n, int64_t stride,
                                        func_t op, vec_func_t vop, bool reduce) {
  VEC_LOOP_HEADER(func_t, data)
  const char* in1_ptr = data[1];
  Vec acc[4];
  // Seed the accumulators with the first group of inputs (loop starts at 1).
  for (const auto j : c10::irange(4)) {
    acc[j] = Vec::loadu(in1_ptr + j * Vec::size() * sizeof(scalar_t));
  }
  // Four independent accumulators hide vector-op latency.
  for (const auto i : c10::irange(1, n)) {
    const char* ptr = in1_ptr + stride * i;
    acc[0] = vop(acc[0], Vec::loadu(ptr + (0 * Vec::size() * sizeof(scalar_t))));
    acc[1] = vop(acc[1], Vec::loadu(ptr + (1 * Vec::size() * sizeof(scalar_t))));
    acc[2] = vop(acc[2], Vec::loadu(ptr + (2 * Vec::size() * sizeof(scalar_t))));
    acc[3] = vop(acc[3], Vec::loadu(ptr + (3 * Vec::size() * sizeof(scalar_t))));
  }
  if (reduce) {
    scalar_t buffer[Vec::size()];
    // Tree-combine the four vectors, then reduce horizontally across lanes.
    acc[0] = vop(vop(acc[0], acc[1]), vop(acc[2], acc[3]));
    acc[0].store(buffer);
    for (const auto j : c10::irange(1, Vec::size())) {
      buffer[0] = op(buffer[0], buffer[j]);
    }
    auto dst = (scalar_t*)out_ptr;
    // Combine with (not overwrite) the value already in the output.
    *dst = op(*dst, buffer[0]);
  } else {
    // Keep four partial vectors: merge into what is already at out_ptr.
    for (const auto j : c10::irange(4)) {
      auto dst = out_ptr + j * Vec::size() * sizeof(scalar_t);
      acc[j] = vop(acc[j], Vec::loadu(dst));
      acc[j].store(dst);
    }
  }
}
70
+
71
// Invoke `f` n times, advancing both data pointers by their respective
// strides after each call. Used to walk a non-reduced "outer" dimension;
// the pointer updates are visible to the caller (data is modified).
template <typename F>
static inline void UNARY_OUTER_LOOP(char* data[2], const int64_t strides[2], int64_t n, F f) {
  for (int64_t step = 0; step < n; ++step) {
    f();
    data[0] += strides[0];
    data[1] += strides[1];
  }
}
79
+
80
// computes the reduction out = op(out, in)
// Inner (contiguous) reduction of n input elements into a single scalar
// at data[0]: full panels of 4 * Vec::size() elements go through the
// SIMD kernel; the tail is finished by a scalar basic_loop.
template <typename func_t, typename vec_func_t>
static inline void vectorized_inner_reduction(char** data, int64_t n, func_t op, vec_func_t vop) {
  VEC_LOOP_HEADER(func_t, data)
  int64_t vector_stride = 4 * Vec::size() * sizeof(scalar_t);
  int64_t count = n / (4 * Vec::size());
  if (count > 0) {
    vectorized_reduction(data, count, vector_stride, op, vop, /*reduce=*/true);
  }
  // Scalar tail over [count * 4 * Vec::size(), n). The output is passed as
  // both destination and first argument with stride 0, so basic_loop
  // accumulates in place at data[0].
  char* ptrs[3] = { data[0], data[0], data[1] };
  int64_t strides[] = { 0, 0, sizeof(scalar_t) };
  basic_loop(ptrs, strides, count * 4 * Vec::size(), n, op);
}
93
+
94
// computes the reduction out = op(out, in)
// Outer reduction: fold size0 input rows (inner_stride bytes apart) into
// one output row of size1 contiguous elements. Columns are processed in
// panels of 4 * Vec::size() elements; the leftover columns fall back to
// a scalar basic_loop, one column at a time.
template <typename func_t, typename vec_func_t>
static inline void vectorized_outer_reduction(char** data, int64_t inner_stride, int64_t size0, int64_t size1, func_t op, vec_func_t vop) {
  VEC_LOOP_HEADER(func_t, data)

  // reduce down each column of 4 * Vec::size() elements (128 or 256 bytes)
  // The constants are the panel width in bytes (4 vectors of 64B for
  // AVX512, 4 of 32B otherwise); both output and input advance by a
  // full panel per outer step.
#if defined(CPU_CAPABILITY_AVX512)
  int64_t outer_stride[2] = { 256, 256 };
#else
  int64_t outer_stride[2] = { 128, 128 };
#endif
  UNARY_OUTER_LOOP(data, outer_stride, size1 / (4 * Vec::size()), [&] {
    vectorized_reduction(data, size0, inner_stride, op, vop, /*reduce=*/false);
  });

  // reduce down the remaining columns
  int64_t step[] = { sizeof(scalar_t), sizeof(scalar_t) };
  int64_t remaining = size1 % (4 * Vec::size());
  UNARY_OUTER_LOOP(data, step, remaining, [&] {
    // Scalar accumulation into data[0] (output aliased as first arg,
    // stride 0), walking the input column with inner_stride.
    char* ptrs[3] = { data[0], data[0], data[1] };
    int64_t strides[] = { 0, 0, inner_stride };
    basic_loop(ptrs, strides, 0, size0, op);
  });
}
118
+
119
// Store `result` into output tensor `index` of `iter`, but only when
// index addresses a real output (index < num_outputs); surplus slots
// produced by multi-value accumulators are silently dropped.
template<typename traits, typename res_t>
static void set_result(const int index, const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
  // static_assert(std::is_same<res_t, typename traits::arg2_t>::value, "data types must match");
  if (index < num_outputs) {
    char *out = (char *) iter.data_ptr(index);
    *(res_t *) out = result;
  }
}
127
+
128
// Scalar overload: a plain (non-tuple) projected result implies exactly
// one output tensor.
template<typename traits, typename res_t>
static void set_results(const res_t result, const TensorIteratorBase &iter, const int num_outputs) {
  AT_ASSERT(num_outputs == 1);
  set_result<traits>(0, result, iter, num_outputs);
}
133
+
134
// Recursion terminator: index is past the end of the tuple. Returns the
// number of tuple elements visited.
template<typename traits, std::size_t i = 0, typename... tuple_t>
static inline typename std::enable_if<i == sizeof...(tuple_t), std::size_t>::type
for_each_in_tuple(const std::tuple<tuple_t...>& /*t*/, const TensorIteratorBase& /*iter*/, const int /*num_outputs*/) {
  return i;
}

// Writes tuple element i to output tensor i while i < num_outputs, then
// recurses. Returns the index of the first element NOT written, i.e. the
// count of results actually stored (used as a sanity check by the caller).
template<typename traits, std::size_t i = 0, typename... tuple_t>
static inline typename std::enable_if<i < sizeof...(tuple_t), std::size_t>::type
for_each_in_tuple(const std::tuple<tuple_t...>& t, const TensorIteratorBase &iter, const int num_outputs) {
  if (i < (size_t)num_outputs) {
    set_result<traits>(i, std::get<i>(t), iter, num_outputs);
    return for_each_in_tuple<traits, i + 1, tuple_t...>(t, iter, num_outputs);
  }
  return i;
}
149
+
150
// Tuple overload: writes the first num_outputs tuple elements to the
// output tensors and asserts every declared output received a value.
template<typename traits, typename... res_t>
static void set_results(const std::tuple<res_t...>& result, const TensorIteratorBase &iter, const int num_outputs) {
  AT_ASSERT(num_outputs >= 1);
  std::size_t result_size = for_each_in_tuple<traits>(result, iter, num_outputs);
  AT_ASSERT((size_t)num_outputs == result_size);
}
156
+
157
// Trait: true iff every type in Args is exactly T (true for empty Args).
template <typename T, typename... Args>
struct all_same : std::bool_constant<(std::is_same<T, Args>::value && ...)> {};
161
+
162
+ // data_t is the input/output data type.
163
+ // acc_t is a type that contains all the necessary data
164
+ // to continue reducing.
165
+ // index_t is a one-dimensional index
166
+ //
167
+ // ops_t is such that &ops_t::reduce, &ops_t::combine, and &ops_t::project exist and satisfy
168
+ // the following.
169
+ // reduce: (acc_t, data_t, index_t) -> acc_t adds one data point to the accumulated value.
170
+ // combine: (acc_t, acc_t) -> acc_t combines two accumulated values into one.
171
+ // project: acc_t -> out_t finishes the reduction, getting the required output.
172
+ //
173
+ // Additionally, acc_t must be default-constructible:
174
+ // acc_t {} is an identity for combine,
175
+ // and project(acc_t {}) is the value of the operation on zero elements.
176
+ //
177
+ // The point of `combine` is to support parallelization -
178
+ // the idea is to one sequence of `reduce` calls per thread of execution,
179
+ // and then to combine them at the end with `combine`.
180
+ //
181
+ // If there is more than one output element,
182
+ // our parallelization strategy is to use one thread for each of them,
183
+ // which means that `combine` will never be called.
184
+ //
185
+ // If, on the other hand, there is only one, then we split the input into
186
+ // into several pieces, reduce each separately, and then combine them.
187
+
188
// Generic reduction driver. See the comment block above for the ops_t
// contract (reduce / combine / project); `init` is the starting
// accumulator value. Strategy: one call of the outer lambda per output
// element; for a single large output the input range is split across
// threads and per-thread accumulators are merged with ops.combine.
template <typename ops_t, typename init_t>
void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
  using rf_t = decltype(&ops_t::reduce);
  using cf_t = decltype(&ops_t::combine);
  using pf_t = decltype(&ops_t::project);
  using r_traits = binary_function_traits<rf_t>;
  using c_traits = binary_function_traits<cf_t>;
  using p_traits = unary_function_traits<pf_t>;
  using acc_t = typename p_traits::arg1_t;
  using data_t = typename r_traits::arg2_t;
  // The accumulator type must be consistent across reduce/combine/project
  // and the supplied init value.
  static_assert(
    all_same<
      acc_t,
      init_t,
      typename r_traits::arg1_t,
      typename r_traits::result_type,
      typename c_traits::arg1_t,
      typename c_traits::arg2_t,
      typename c_traits::result_type>::value,
    "all accumulate types must match");
  static_assert(
    std::is_default_constructible<acc_t>::value,
    "the accumulate type must be default-constructible"
  );
  const int num_outputs = iter.noutputs();
  // Invoked once per reduced (output) element; sub_iter ranges over the
  // input values that fold into that element.
  iter.foreach_reduced_elt([&ops, &init, num_outputs](TensorIteratorBase &sub_iter) {
    // Serially reduce input elements [begin, end) of sub_iter into acc.
    auto reduction_body = [&ops, &sub_iter, num_outputs](acc_t acc, int64_t begin, int64_t end) -> acc_t {
      int ntensors = sub_iter.ntensors();
      sub_iter.serial_for_each([&acc, &ops, num_outputs, ntensors, begin](char** data, const int64_t* strides, int64_t size) {
        // Exactly one input tensor: the last entry after the outputs.
        AT_ASSERT(ntensors - num_outputs == 1);
        char *in = data[ntensors - 1];
        int64_t stride = strides[ntensors - 1];
        for (const auto i : c10::irange(size)) {
          // begin + i is the linear index passed to ops.reduce (used by
          // index-tracking reductions such as argmin/argmax).
          acc = ops.reduce(acc, c10::load<data_t>(in), begin + i);
          in += stride;
        }
      }, {begin, end});
      // Rebase any indices recorded in acc by the sub-iterator's view
      // offset (part of the ops contract; no-op for plain reductions).
      return ops.translate_idx(acc, sub_iter.view_offsets()[0]);
    };
    acc_t total_acc = init;
    auto numel = sub_iter.numel();
    // Stay serial for small inputs, a single thread, or when already
    // inside a parallel region (no nested parallelism).
    if (numel < at::internal::GRAIN_SIZE || at::get_num_threads() == 1 ||
        at::in_parallel_region()) {
      total_acc = reduction_body(total_acc, 0, numel);
    } else {
      int max_threads = at::get_num_threads();
      AT_ASSERT(max_threads > 0);
      static_assert(
        !std::is_same<acc_t, bool>::value,
        "Concurrently modifying different references into std::vector<bool> is UB."
      );
      // One accumulator slot per thread; each thread only touches its own.
      std::vector<acc_t> buffer((unsigned)max_threads, init);
      at::parallel_for(0, numel, internal::GRAIN_SIZE,
        [&](int64_t begin, int64_t end) {
          auto& acc = buffer[at::get_thread_num()];
          acc = reduction_body(acc, begin, end);
        }
      );
      // Merge per-thread partial results.
      for (const auto i : c10::irange(max_threads)) {
        total_acc = ops.combine(total_acc, buffer[i]);
      }
    }
    set_results<r_traits>(ops.project(total_acc), sub_iter, num_outputs);
  });
}
253
+
254
// Vectorized reduction driver for simple binary ops: `op` is the scalar
// combine function, `vop` its Vectorized counterpart, and `ident` the
// identity value pre-filled into the output. Dispatches each
// parallel_reduce chunk to the contiguous, outer, or generic strided
// loop depending on the reported strides.
template <typename func_t, typename vec_func_t>
void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
  using traits = binary_function_traits<func_t>;
  static_assert(
    all_same<
      typename traits::result_type,
      typename traits::arg1_t,
      typename traits::arg2_t>::value,
    "all types must match");

  // Seed the output with the identity so accumulate-in-place is correct.
  iter.output_base().fill_(ident);
  iter.parallel_reduce([&](char** data, const int64_t* strides, int64_t size0, int64_t size1) {
    int64_t outer_strides[] = { strides[2], strides[3] };
    if (is_contiguous_reduction<traits>(strides)) {
      // input is contiguous in dim 0, output is reduced in dim 0
      UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
        vectorized_inner_reduction(data, size0, op, vop);
      });
    } else if (is_outer_reduction<traits>(strides)) {
      // input and output are contiguous in dim 1
      int64_t inner_stride = strides[1]; // stride of input in dim 0
      vectorized_outer_reduction(data, inner_stride, size0, size1, op, vop);
    } else {
      // Generic strided fallback: scalar accumulation row by row, with
      // the output aliased as the first argument of basic_loop.
      UNARY_OUTER_LOOP(data, outer_strides, size1, [&] {
        char* ptrs[3] = { data[0], data[0], data[1] };
        int64_t inner_strides[3] = { strides[0], strides[0], strides[1] };
        basic_loop(ptrs, inner_strides, 0, size0, op);
      });
    }
  });
}
285
+
286
// when reduction is on most inner dimension (dim 0 in TensorIterator)
// and input has contiguous most inner dimension, `binary_kernel_reduce_lastdim`
// can be used.
// Conditions checked: exactly one reduced dim, it is dim 0, there is a
// single input, and the input's dim-0 stride equals its element size
// (i.e. the reduced dim is contiguous).
static inline bool is_reduce_lastdim(TensorIteratorBase& iter) {
  return iter.num_reduce_dims() == 1 && iter.is_dim_reduced(0)
      && iter.ninputs() == 1 && iter.strides(1)[0] == iter.element_size(1);
}
293
+
294
// Driver for last-dim reductions (see is_reduce_lastdim above).
// `reduce_op(out, in, dim_size)` must consume one full row of dim_size
// input elements and write the reduced value(s) to `out`. Parallelism is
// over the non-reduced dimensions via a sub-iterator narrowed to a
// single slot along dim 0.
template <typename reduce_func_t>
void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce_op) {
  auto shape = iter.shape();
  int64_t dim_size = shape[0];
  // Scale the grain so each parallel task covers roughly GRAIN_SIZE
  // scalar elements regardless of row length (never below 1 row).
  int64_t grain_size = std::max((int64_t) 1, at::internal::GRAIN_SIZE / dim_size);
  TensorIterator sub_iter(iter);
  // create sub iterator to parallel on all non-reduce-dims
  sub_iter.narrow(0, 0, 1);
  auto loop = [&](char** data, const int64_t* strides, int64_t size) {
    char* out = data[0];
    char* in = data[1];
    for (int64_t i = 0; i < size; ++i) {
      // One reduce_op call per output element / input row.
      reduce_op(out, in, dim_size);
      out += strides[0];
      in += strides[1];
    }
  };
  sub_iter.for_each(loop, grain_size);
}
313
+
314
+ }}} // namespace at::native::<anonymous>
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/SerialStackImpl.h ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Copyright 2004-present Facebook. All Rights Reserved.
2
+ #pragma once
3
+
4
+ #include <ATen/core/Tensor.h>
5
+
6
+ #include <ATen/MemoryOverlap.h>
7
+ #include <ATen/Parallel.h>
8
+ #include <ATen/TensorIterator.h>
9
+ #include <ATen/cpu/vec/functional.h>
10
+ #include <ATen/cpu/vec/vec.h>
11
+ #include <c10/util/irange.h>
12
+
13
+ namespace at { namespace native { namespace detail {
14
+
15
// Cached per-input info for the serial stack kernel: the raw data
// pointer plus the number of elements this input contributes per outer
// step (its size along the stack dim times the supplied inner stride).
struct InputMeta {
  void* data_ptr;
  int64_t inner_size;

  InputMeta(const Tensor& t, int64_t dim, int64_t inner)
      : data_ptr(t.data_ptr()), inner_size(t.sizes()[dim] * inner) {}
};
22
+
23
// This kernel is used by two TensorList types:
// 1. stack_serial_kernel uses at::ArrayRef<Tensor>
// 2. Static runtime calls this kernel directly (csrc/jit/runtime/static/ops.cpp) with
//    ProcessedNodeInputWrapper.
// When making changes, make sure that they are compatible with both types!
//
// Single-threaded stack along `dim`: for each outer slice, copies each
// input's inner chunk into `result` back-to-back. Assumes the
// contiguity/dtype preconditions checked by
// can_use_native_serial_stack_impl below.
template <typename scalar_t, typename TensorListType>
void stack_serial_kernel_impl(Tensor& result, TensorListType tensors, int64_t dim) {
  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
      dim >= 0 && dim <= result.dim(),
      "dim out of range in stack_serial_kernel_impl");
  // Number of outer slices: everything above the stack dim.
  int64_t outer =
      result.numel() / (result.sizes()[dim] * result.strides()[dim]);
  scalar_t* result_data = result.data_ptr<scalar_t>();
  int64_t ninputs = tensors.size();
  // Pre-resolve data pointers and per-slice chunk sizes once, outside the
  // copy loops.
  std::vector<InputMeta> inputs;
  inputs.reserve(ninputs);
  for (const auto& tensor : tensors) {
    inputs.emplace_back(tensor, dim, tensor.strides()[dim]);
  }

  using Vec = vec::Vectorized<scalar_t>;
  scalar_t* result_ptr = result_data;
  for (const auto i : c10::irange(outer)) {
    for (const auto j : c10::irange(ninputs)) {
      int64_t local_inner = inputs[j].inner_size;
      scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner;

      if (local_inner < Vec::size()) {
        // Chunk smaller than one SIMD vector: plain scalar copy.
        for (const auto k : c10::irange(local_inner)) {
          result_ptr[k] = input_ptr[k];
        }
      } else {
        // Vectorized copy via an identity map over the chunk.
        vec::map(
            [](Vec x) { return x; }, result_ptr, input_ptr, local_inner);
      }
      result_ptr += local_inner;
    }
  }
}
62
+
63
// Checks to see whether native stack can be invoked under these conditions:
// - result and input tensors are contiguous
// - only one thread is used
// - no type promotion has to occur
// - tensors dtype is Double or Float
template <typename TensorListType>
bool can_use_native_serial_stack_impl(Tensor& result, TensorListType tensors, int64_t dim) {
  TORCH_CHECK(tensors.size() > 0, "expected a non-empty list of Tensors");
  const Tensor& first_tensor = tensors[0];
  // stack dimension should be in range [0,firstTensor.dim())
  // dim == firstTensor.dim() is a valid input, but it is handled by default code path
  // that uses unsqueeze
  if (dim >= first_tensor.dim()) return false;
  // Native stack doesn't apply if tensors would be skipped: an empty 1-D
  // first tensor (numel 0, dim 1) falls back to the default path.
  if (first_tensor.numel() == 0 && first_tensor.dim() == 1) return false;
  // there should be no type promotion
  if (result.dtype() != first_tensor.dtype()) return false;

  auto first_tensor_mem_format = first_tensor.suggest_memory_format();
  ScalarType dtype = first_tensor.scalar_type();

  // The output must already be contiguous in the inputs' memory format.
  if (!result.is_contiguous(first_tensor_mem_format)) {
    return false;
  }

  // fast path only works for Double and Float
  if (dtype != ScalarType::Double && dtype != ScalarType::Float) {
    return false;
  }

  // check remainder of inputs
  auto const &first_tensor_shape = first_tensor.sizes();
  for (const auto i : c10::irange(1, tensors.size())) {
    auto const &tensor = tensors[i];
    // Mismatched sizes are a hard error (stack's contract), not merely a
    // reason to take the slow path.
    TORCH_CHECK(tensors[i].sizes() == first_tensor.sizes(),
      "stack expects each tensor to be equal size, but got ", first_tensor_shape,
      " at entry 0 and ", tensor.sizes(), " at entry ", i);

    // every tensor must be contiguous
    // tensor sizes and strides must be the same
    // there should be no type promotion
    if (!tensor.is_contiguous(first_tensor_mem_format) ||
        tensor.strides() != first_tensor.strides() ||
        tensor.dtype() != dtype) {
      return false;
    }
  }

  // fast native stack should only be used when it is not worth using multiple threads
  // or there is only one thread. Note that we aren't checking result.numel() here because
  // it may not have been resized and we want to defer that cost till later.
  int64_t numel_in_stack = first_tensor.numel() * tensors.size();
  return numel_in_stack < at::internal::GRAIN_SIZE || at::get_num_threads() == 1;
}
117
+
118
// Policy wrapper selecting whether the input/output overlap check runs
// before the eligibility test. The bool parameter is
// should_skip_overlap_check.
template <typename TensorListType, bool should_skip_overlap_check>
struct CanUseNativeSerialStack;

// Overlap check enabled: reject (with an error) any input that aliases
// the output's memory, then run the eligibility checks.
template <typename TensorListType>
struct CanUseNativeSerialStack<TensorListType, false> {
  static bool call(Tensor& result, TensorListType tensors, int64_t dim) {
    // Inputs cannot alias the output tensor
    for (const auto i : c10::irange(tensors.size())) {
      auto lap = at::get_overlap_status(result, tensors[i]);
      TORCH_CHECK(lap != at::MemOverlapStatus::Partial &&
          lap != at::MemOverlapStatus::Full, 0,
          "unsupported operation: the input tensors cannot refer to any of the "
          "output memory locations. Found overlap in input tensor ", i);
    }

    return can_use_native_serial_stack_impl(result, tensors, dim);
  }
};

// Overlap check skipped: caller guarantees no aliasing.
template <typename TensorListType>
struct CanUseNativeSerialStack<TensorListType, true> {
  static bool call(Tensor& result, TensorListType tensors, int64_t dim) {
    return can_use_native_serial_stack_impl(result, tensors, dim);
  }
};
143
+
144
+ }}} // namespace at::native::detail
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/StackKernel.h ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once

#include <ATen/core/Tensor.h>
#include <ATen/native/DispatchStub.h>

namespace at { namespace native {

// Signature of the serial stack kernel: (result, input tensors, stack dim).
using stack_serial_fn = void(*)(Tensor &, TensorList, int64_t);
// Per-CPU-capability dispatch entry; implementations register via
// DispatchStub.
DECLARE_DISPATCH(stack_serial_fn, stack_serial_stub);

}} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/int_mm_kernel.h ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once

#include <ATen/core/Tensor.h>
#include <ATen/native/DispatchStub.h>

namespace at::native {

// Kernel signatures for low-precision integer matmul CPU kernels.
// NOTE(review): exact argument meanings (which Tensor is the packed
// weight, what the int parameters encode) are defined by the registered
// implementations — confirm against the corresponding .cpp kernels.
using weight_to_int4pack_fn = void(*)(const Tensor&, const Tensor&, int, int);
using int4pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int, const Tensor&, int, int);
using int8pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&);

// Per-CPU-capability dispatch entries for the kernels above.
DECLARE_DISPATCH(weight_to_int4pack_fn, weight_to_int4pack_stub);
DECLARE_DISPATCH(int4pack_mm_fn, int4pack_mm_stub);
DECLARE_DISPATCH(int8pack_mm_fn, int8pack_mm_stub);

} // namespace at::native
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/mps/MPSGraphVenturaOps.h ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#pragma once
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>

// TODO: Remove me when moved to MacOS 13
// Compile-time shim: when building against an SDK that predates macOS
// 13.2, declare a stand-in for MPSGraphConvolution3DOpDescriptor (and
// alias it to the real name below) so callers can target the 13.2 API
// unconditionally.
#if !defined(__MAC_13_2) && \
    (!defined(MAC_OS_X_VERSION_13_2) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_2))

@interface FakeMPSGraphConvolution3DOpDescriptor : NSObject<NSCopying>

// Convolution stride and dilation rate along each spatial axis.
@property (readwrite, nonatomic) NSUInteger strideInX;
@property (readwrite, nonatomic) NSUInteger strideInY;
@property (readwrite, nonatomic) NSUInteger strideInZ;
@property (readwrite, nonatomic) NSUInteger dilationRateInX;
@property (readwrite, nonatomic) NSUInteger dilationRateInY;
@property (readwrite, nonatomic) NSUInteger dilationRateInZ;

// Explicit padding on each of the six faces of the 3-D volume.
@property (readwrite, nonatomic) NSUInteger paddingLeft;
@property (readwrite, nonatomic) NSUInteger paddingRight;
@property (readwrite, nonatomic) NSUInteger paddingTop;
@property (readwrite, nonatomic) NSUInteger paddingBottom;
@property (readwrite, nonatomic) NSUInteger paddingFront;
@property (readwrite, nonatomic) NSUInteger paddingBack;

// Padding policy and tensor layouts for data and weights.
@property (readwrite, nonatomic) MPSGraphPaddingStyle paddingStyle;
@property (readwrite, nonatomic) MPSGraphTensorNamedDataLayout dataLayout;
@property (readwrite, nonatomic) MPSGraphTensorNamedDataLayout weightsLayout;

// Number of convolution groups (1 = dense convolution).
@property (readwrite, nonatomic) NSUInteger groups;

@end

// Make the stand-in usable under the real 13.2 class name.
@compatibility_alias MPSGraphConvolution3DOpDescriptor FakeMPSGraphConvolution3DOpDescriptor;

#endif
35
+
36
+ @interface MPSGraph (VenturaOps)
37
+
38
+ #if !defined(__MAC_13_0) && \
39
+ (!defined(MAC_OS_X_VERSION_13_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_13_0))
40
+
41
+ typedef NS_ENUM(NSUInteger, MPSGraphResizeNearestRoundingMode)
42
+ {
43
+ MPSGraphResizeNearestRoundingModeRoundPreferCeil = 0L,
44
+ MPSGraphResizeNearestRoundingModeRoundPreferFloor = 1L,
45
+ MPSGraphResizeNearestRoundingModeCeil = 2L,
46
+ MPSGraphResizeNearestRoundingModeFloor = 3L,
47
+ MPSGraphResizeNearestRoundingModeRoundToEven = 4L,
48
+ MPSGraphResizeNearestRoundingModeRoundToOdd = 5L,
49
+ };
50
+
51
+ // Define complex enums for MacOS 12
52
+ #define MPSDataTypeComplexBit 0x01000000
53
+ #define MPSDataTypeComplexFloat32 ((MPSDataType) (MPSDataTypeFloatBit | MPSDataTypeComplexBit | 64))
54
+ #define MPSDataTypeComplexFloat16 ((MPSDataType) (MPSDataTypeFloatBit | MPSDataTypeComplexBit | 32))
55
+ #endif
56
+
57
+ - (MPSGraphTensor * _Nonnull) convolution3DWithSourceTensor:(MPSGraphTensor * _Nonnull) source
58
+ weightsTensor:(MPSGraphTensor * _Nonnull) weights
59
+ descriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) descriptor
60
+ name:(NSString * _Nullable) name;
61
+
62
+ - (MPSGraphTensor * _Nonnull) convolution3DDataGradientWithIncomingGradientTensor:(MPSGraphTensor * _Nonnull) incomingGradient
63
+ weightsTensor:(MPSGraphTensor * _Nonnull) weights
64
+ outputShape:(MPSShape * _Nonnull) outputShape
65
+ forwardConvolutionDescriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) forwardConvolutionDescriptor
66
+ name:(NSString * _Nullable) name;
67
+
68
+ - (MPSGraphTensor * _Nonnull) convolution3DWeightsGradientWithIncomingGradientTensor:(MPSGraphTensor * _Nonnull) incomingGradient
69
+ sourceTensor:(MPSGraphTensor * _Nonnull) source
70
+ outputShape:(MPSShape * _Nonnull) outputShape
71
+ forwardConvolutionDescriptor:(MPSGraphConvolution3DOpDescriptor * _Nonnull) forwardConvolutionDescriptor
72
+ name:(NSString * _Nullable) name;
73
+
74
+ - (MPSGraphTensor * _Nonnull)cumulativeSumWithTensor:(MPSGraphTensor * _Nonnull)tensor
75
+ axis:(NSInteger)axis
76
+ name:(NSString * _Nullable)name;
77
+
78
+ - (MPSGraphTensor * _Nonnull)sortWithTensor:(MPSGraphTensor * _Nonnull)tensor
79
+ axis:(NSInteger)axis
80
+ name:(NSString * _Nullable)name;
81
+
82
+ - (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
83
+ axis:(NSInteger) axis
84
+ descending:(BOOL) descending
85
+ name:(NSString * _Nullable) name;
86
+
87
+ - (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
88
+ axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
89
+ descending:(BOOL) descending
90
+ name:(NSString * _Nullable) name;
91
+
92
+ - (MPSGraphTensor * _Nonnull) sortWithTensor:(MPSGraphTensor * _Nonnull) tensor
93
+ axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
94
+ name:(NSString * _Nullable) name;
95
+
96
+ - (MPSGraphTensor * _Nonnull)argSortWithTensor:(MPSGraphTensor * _Nonnull)tensor
97
+ axis:(NSInteger)axis
98
+ name:(NSString * _Nullable)name;
99
+
100
+ - (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
101
+ axis:(NSInteger) axis
102
+ descending:(BOOL) descending
103
+ name:(NSString * _Nullable) name;
104
+
105
+ - (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
106
+ axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
107
+ descending:(BOOL) descending
108
+ name:(NSString * _Nullable) name;
109
+
110
+ - (MPSGraphTensor * _Nonnull) argSortWithTensor:(MPSGraphTensor * _Nonnull) tensor
111
+ axisTensor:(MPSGraphTensor * _Nonnull) axisTensor
112
+ name:(NSString * _Nullable) name;
113
+
114
+ - (MPSGraphTensor * _Nonnull)inverseOfTensor:(MPSGraphTensor * _Nonnull) inputTensor
115
+ name:(NSString * _Nullable)name;
116
+
117
+ - (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
118
+ sizeTensor:(MPSGraphTensor * _Nonnull) size
119
+ nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
120
+ centerResult:(BOOL) centerResult
121
+ alignCorners:(BOOL) alignCorners
122
+ layout:(MPSGraphTensorNamedDataLayout) layout
123
+ name:(NSString * _Nullable) name;
124
+
125
+ - (MPSGraphTensor * _Nonnull) resizeNearestWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
126
+ sizeTensor:(MPSGraphTensor * _Nonnull) size
127
+ scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
128
+ nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
129
+ layout:(MPSGraphTensorNamedDataLayout) layout
130
+ name:(NSString * _Nullable) name;
131
+
132
+ - (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
133
+ sizeTensor:(MPSGraphTensor * _Nonnull) size
134
+ centerResult:(BOOL) centerResult
135
+ alignCorners:(BOOL) alignCorners
136
+ layout:(MPSGraphTensorNamedDataLayout) layout
137
+ name:(NSString * _Nullable) name;
138
+
139
+ - (MPSGraphTensor * _Nonnull) resizeBilinearWithTensor:(MPSGraphTensor * _Nonnull) imagesTensor
140
+ sizeTensor:(MPSGraphTensor * _Nonnull) size
141
+ scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
142
+ layout:(MPSGraphTensorNamedDataLayout) layout
143
+ name:(NSString * _Nullable) name;
144
+
145
+ - (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
146
+ input:(MPSGraphTensor * _Nonnull) input
147
+ nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
148
+ centerResult:(BOOL) centerResult
149
+ alignCorners:(BOOL) alignCorners
150
+ layout:(MPSGraphTensorNamedDataLayout) layout
151
+ name:(NSString * _Nullable) name;
152
+
153
+ - (MPSGraphTensor * _Nonnull) resizeNearestWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
154
+ input:(MPSGraphTensor * _Nonnull) input
155
+ scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
156
+ nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
157
+ layout:(MPSGraphTensorNamedDataLayout) layout
158
+ name:(NSString * _Nullable) name;
159
+
160
+ - (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
161
+ input:(MPSGraphTensor * _Nonnull) input
162
+ centerResult:(BOOL) centerResult
163
+ alignCorners:(BOOL) alignCorners
164
+ layout:(MPSGraphTensorNamedDataLayout) layout
165
+ name:(NSString * _Nullable) name;
166
+
167
+ - (MPSGraphTensor * _Nonnull) resizeBilinearWithGradientTensor:(MPSGraphTensor * _Nonnull) gradient
168
+ input:(MPSGraphTensor * _Nonnull) input
169
+ scaleOffsetTensor:(MPSGraphTensor * _Nonnull) scaleOffset
170
+ layout:(MPSGraphTensorNamedDataLayout) layout
171
+ name:(NSString * _Nullable) name;
172
+
173
+ - (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source
174
+ coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates
175
+ layout:(MPSGraphTensorNamedDataLayout) layout
176
+ normalizeCoordinates:(BOOL) normalizeCoordinates
177
+ relativeCoordinates:(BOOL) relativeCoordinates
178
+ alignCorners:(BOOL) alignCorners
179
+ paddingMode:(MPSGraphPaddingMode) paddingMode
180
+ samplingMode:(MPSGraphResizeMode) samplingMode
181
+ constantValue:(double) constantValue
182
+ name:(NSString * _Nullable) name;
183
+
184
+ - (MPSGraphTensor * _Nonnull) sampleGridWithSourceTensor:(MPSGraphTensor * _Nonnull) source
185
+ coordinateTensor:(MPSGraphTensor * _Nonnull) coordinates
186
+ layout:(MPSGraphTensorNamedDataLayout) layout
187
+ normalizeCoordinates:(BOOL) normalizeCoordinates
188
+ relativeCoordinates:(BOOL) relativeCoordinates
189
+ alignCorners:(BOOL) alignCorners
190
+ paddingMode:(MPSGraphPaddingMode) paddingMode
191
+ nearestRoundingMode:(MPSGraphResizeNearestRoundingMode) nearestRoundingMode
192
+ constantValue:(double) constantValue
193
+ name:(NSString * _Nullable) name;
194
+ - (MPSGraphTensor * _Nonnull) truncateWithTensor:(MPSGraphTensor * _Nonnull) tensor
195
+ name:(NSString * _Nullable) name;
196
+
197
+ @end
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorFactories.h ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ namespace at {
4
+ namespace native {
5
+
6
+ } // namespace native
7
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorMath.h ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/core/ATen_fwd.h>
4
+ #include <ATen/NestedTensorImpl.h>
5
+ #include <c10/macros/Macros.h>
6
+
7
+ namespace at {
8
+ namespace native {
9
+
10
+ TORCH_API Tensor NestedTensor_to_padded_tensor_generic(
11
+ const Tensor& t,
12
+ double padding,
13
+ OptionalIntArrayRef output_size);
14
+
15
+ template <typename Func>
16
+ Tensor map_nt(const Tensor& nt, Func f) {
17
+ auto* nt_impl = get_nested_tensor_impl(nt);
18
+ const auto& sizes = nt_impl->get_nested_sizes();
19
+ return at::detail::make_tensor<NestedTensorImpl>(f(nt_impl->get_buffer()), sizes);
20
+ }
21
+ template <typename Func>
22
+ Tensor map_nt_binary(const Tensor& nt_1, const Tensor& nt_2, Func f){
23
+ auto* nt_impl_1 = get_nested_tensor_impl(nt_1);
24
+ auto* nt_impl_2 = get_nested_tensor_impl(nt_2);
25
+ const auto& sizes = nt_impl_1->get_nested_sizes();
26
+ return at::detail::make_tensor<NestedTensorImpl>(f(nt_impl_1->get_buffer(), nt_impl_2->get_buffer()), sizes);
27
+ }
28
+
29
+ C10_ALWAYS_INLINE std::pair<int64_t, int64_t> _check_nested_layer_norm_inputs(
30
+ const NestedTensorImpl& input,
31
+ IntArrayRef normalized_shape,
32
+ const Tensor& weight /* optional */,
33
+ const Tensor& bias /* optional */) {
34
+
35
+ const size_t normalized_ndim = normalized_shape.size();
36
+ TORCH_CHECK(
37
+ normalized_ndim >= 1,
38
+ "Expected normalized_shape to be at least 1-dimensional, i.e., ",
39
+ "containing at least one element, but got normalized_shape = ",
40
+ normalized_shape);
41
+ TORCH_CHECK(
42
+ !weight.defined() || weight.sizes().equals(normalized_shape),
43
+ "Expected weight to be of same shape as normalized_shape, but got ",
44
+ "weight of shape ",
45
+ weight.sizes(),
46
+ " and normalized_shape = ",
47
+ normalized_shape);
48
+ TORCH_CHECK(
49
+ !bias.defined() || bias.sizes().equals(normalized_shape),
50
+ "Expected bias to be of same shape as normalized_shape, but got ",
51
+ "bias of shape ",
52
+ bias.sizes(),
53
+ " and normalized_shape = ",
54
+ normalized_shape);
55
+
56
+ // Check that the normalized_shape has the exact same sizes as the last dimensions from the NestedTensor input
57
+ // Also, compute M and N considering the idiosyncracies of NestedTensors
58
+ int64_t N = 1;
59
+ for (const auto i: c10::irange(normalized_ndim)) {
60
+ TORCH_CHECK(
61
+ input.opt_size(-normalized_ndim + i) != c10::nullopt,
62
+ "normalized_shape extends into irregular dimensions for the nested tensor"
63
+ );
64
+ TORCH_CHECK(
65
+ normalized_shape[i] == *input.opt_size(-normalized_ndim + i),
66
+ "The shape at dimension ",
67
+ i,
68
+ "of normalized_shape doesn't match the input"
69
+ );
70
+ N *= normalized_shape[i];
71
+ }
72
+
73
+ const int64_t M = input.numel() / N;
74
+
75
+ return std::make_pair(M, N);
76
+ }
77
+
78
+ Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape);
79
+
80
+ } // namespace native
81
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorTransformerUtils.h ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include <ATen/ATen.h>
2
+
3
+
4
+ namespace at {
5
+ namespace native {
6
+ namespace preprocessing {
7
+
8
+ /**
9
+ * This function will take nested query, key, and value
10
+ * and will preprocess it in order to run with either
11
+ * the flash-attention or efficient-attention kernels.
12
+ * @return A tuple containing all the necessary data for running the fused
13
+ * kernels
14
+ */
15
+ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, int64_t, int64_t, Tensor>
16
+ sdpa_nested_preprocessing(
17
+ const Tensor& query,
18
+ const Tensor& key,
19
+ const Tensor& value);
20
+
21
+ /**
22
+ * This function will take nested query, key, and value, grad_out, and out
23
+ * and will preprocess it in order to run with either
24
+ * the flash-attention or efficient-attention kernels backwards.
25
+ * We use both functions to avoid having to do the same preprocessing
26
+ * for cumulative_sequence_length_q and cumulative_sequence_length_kv
27
+ * @return A tuple containing all the necessary data for running the fused
28
+ * kernels
29
+ */
30
+ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor>
31
+ sdpa_nested_preprocessing_backward(
32
+ const at::Tensor& grad_out_,
33
+ const at::Tensor& query,
34
+ const at::Tensor& key,
35
+ const at::Tensor& value,
36
+ const at::Tensor& out,
37
+ const Tensor& cumulative_sequence_length_q,
38
+ const Tensor& cumulative_sequence_length_kv,
39
+ const int64_t max_seqlen_batch_q,
40
+ const int64_t max_seqlen_batch_kv);
41
+
42
+ } // namespace preprocessing
43
+ } // namespace native
44
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/nested/NestedTensorUtils.h ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ #include <ATen/Dispatch.h>
4
+ #include <ATen/NestedTensorImpl.h>
5
+ #include <ATen/Parallel.h>
6
+ #include <ATen/core/Tensor.h>
7
+ #include <c10/core/DispatchKeySet.h>
8
+ #include <c10/core/TensorImpl.h>
9
+ #include <c10/macros/Macros.h>
10
+ #include <c10/util/Exception.h>
11
+
12
+ #ifndef AT_PER_OPERATOR_HEADERS
13
+
14
+ #include <ATen/Functions.h>
15
+ #include <ATen/NativeFunctions.h>
16
+ #else
17
+ #include <ATen/ops/cat.h>
18
+ #include <ATen/ops/empty.h>
19
+ #include <ATen/ops/ones_native.h>
20
+ #include <ATen/ops/prod.h>
21
+ #include <ATen/ops/stack_native.h>
22
+ #include <ATen/ops/tensor.h>
23
+ #endif
24
+
25
+ #include <utility>
26
+ #include <vector>
27
+
28
+ namespace at {
29
+ namespace native {
30
+ struct NestedTensorImpl;
31
+
32
+ // The following functions are used to construct nested tensors from buffers and
33
+ // metadata.
34
+
35
+ inline at::Tensor wrap_buffer(at::Tensor buffer, at::Tensor nested_sizes) {
36
+ TORCH_CHECK(
37
+ buffer.dim() == 1,
38
+ "Expected given buffer to be 1dim, but got ",
39
+ buffer.dim(),
40
+ " instead.");
41
+ TORCH_CHECK(
42
+ buffer.is_contiguous(), "Expected given buffer to be contiguous.");
43
+ return at::detail::make_tensor<NestedTensorImpl>(
44
+ std::move(buffer), std::move(nested_sizes));
45
+ }
46
+
47
+ // TODO: Figure out if we need a non-moving wrap_buffer()
48
+ inline at::Tensor wrap_buffer(
49
+ at::Tensor buffer,
50
+ at::Tensor nested_sizes,
51
+ at::Tensor nested_strides,
52
+ at::Tensor storage_offsets) {
53
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
54
+ buffer.is_contiguous(), "Given buffer must be contiguous.");
55
+ return at::detail::make_tensor<NestedTensorImpl>(
56
+ std::move(buffer),
57
+ std::move(nested_sizes),
58
+ std::move(nested_strides),
59
+ std::move(storage_offsets));
60
+ }
61
+
62
+ inline at::Tensor get_buffer(const at::Tensor& tensor) {
63
+ return get_nested_tensor_impl(tensor)->get_buffer();
64
+ }
65
+
66
+ /**
67
+ * Create a new nested tensor that is a view of a base nested tensor
68
+ *
69
+ * create_view_tensor calls a specialized constructor that copys the
70
+ * the keys from base onto the new view tensor being created.
71
+ * The storage is shared between the base and the returned view tensor
72
+ *
73
+ * All callers of this helper must:
74
+ * - Only return a view of the input
75
+ * - Must be explicit and define a derivative
76
+ *
77
+ * @param base Base tensor to construct view from.
78
+ * @param nested_sizes View tensors' sizes.
79
+ * @param nested_strides View tensors' strides.
80
+ * @param storage_offsets View tensors' offsets.
81
+ * @return A newly constructed view tensor
82
+ */
83
+ inline at::Tensor create_nested_view_tensor(
84
+ const at::Tensor& base,
85
+ at::Tensor nested_sizes,
86
+ at::Tensor nested_strides,
87
+ at::Tensor storage_offsets) {
88
+ TORCH_INTERNAL_ASSERT(
89
+ base.is_nested(),
90
+ "This function can only be used to create nested tensor views");
91
+ TORCH_INTERNAL_ASSERT(
92
+ c10::impl::tls_local_dispatch_key_set().excluded_.has(
93
+ c10::DispatchKey::AutogradFunctionality),
94
+ "Creating a non differentiable nested tensor view in a CompositeImplicit function is not allowed.");
95
+ return at::detail::make_tensor<NestedTensorImpl>(
96
+ c10::TensorImpl::VIEW,
97
+ base,
98
+ nested_sizes,
99
+ nested_strides,
100
+ storage_offsets);
101
+ }
102
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
103
+
104
+ // Helper functions for getting information about a nested tensor's shape.
105
+
106
+ int64_t get_consistent_last_dim_of_nested_tensor(const NestedTensorImpl& nt);
107
+
108
+ // The sizes of the underlying tensors
109
+ inline std::vector<IntArrayRef> NestedTensor_get_sizes(
110
+ const NestedTensorImpl* self_ptr) {
111
+ int64_t ntensors = self_ptr->size(0);
112
+ std::vector<IntArrayRef> sizes(ntensors);
113
+ if (ntensors == 0) {
114
+ return sizes;
115
+ }
116
+ const Tensor& sizemat = self_ptr->get_nested_sizes();
117
+ int64_t orig_dim = sizemat.size(1);
118
+ // nesting scalars has empty sizes
119
+ if (orig_dim == 0) {
120
+ return sizes;
121
+ }
122
+ const int64_t* sizemat_ptr = sizemat.data_ptr<int64_t>();
123
+
124
+ for (const auto i : c10::irange(ntensors)) {
125
+ sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim);
126
+ sizemat_ptr += orig_dim;
127
+ }
128
+ return sizes;
129
+ }
130
+
131
+ TORCH_API std::vector<int64_t> NestedTensor_get_max_size(
132
+ const NestedTensorImpl& nt);
133
+
134
+ std::vector<int64_t> NestedTensor_get_max_size_from_size_tensor(
135
+ const Tensor& sizes);
136
+
137
+ inline std::vector<IntArrayRef> NestedTensor_get_sizes(const at::Tensor& self) {
138
+ const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self);
139
+ return NestedTensor_get_sizes(self_ptr);
140
+ }
141
+ // The strides of the underlying tensors
142
+ inline std::vector<IntArrayRef> NestedTensor_get_strides(
143
+ const NestedTensorImpl* self_ptr) {
144
+ int64_t ntensors = self_ptr->size(0);
145
+ std::vector<IntArrayRef> strides(ntensors);
146
+ if (ntensors == 0) {
147
+ return strides;
148
+ }
149
+ const Tensor& stridemat = self_ptr->get_nested_strides();
150
+ int64_t orig_dim = stridemat.size(1);
151
+ // nesting scalars has empty strides
152
+ if (orig_dim == 0) {
153
+ return strides;
154
+ }
155
+ const int64_t* stridemat_ptr = stridemat.data_ptr<int64_t>();
156
+ for (const auto i : c10::irange(ntensors)) {
157
+ strides[i] = IntArrayRef(stridemat_ptr, stridemat_ptr + orig_dim);
158
+ stridemat_ptr += orig_dim;
159
+ }
160
+ return strides;
161
+ }
162
+
163
+ inline std::vector<IntArrayRef> NestedTensor_get_strides(
164
+ const at::Tensor& self) {
165
+ const NestedTensorImpl* self_ptr = get_nested_tensor_impl(self);
166
+ return NestedTensor_get_strides(self_ptr);
167
+ }
168
+
169
+ inline void check_numel_equals_buffer_size(const at::Tensor& self) {
170
+ auto self_impl = get_nested_tensor_impl(self);
171
+ TORCH_CHECK(
172
+ self.numel() == static_cast<int64_t>(self_impl->get_buffer_size()),
173
+ "Number of elements in nested tensor must match number of elements in buffer.");
174
+ }
175
+
176
+ inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) {
177
+ TORCH_CHECK(
178
+ self_ptr->numel() == static_cast<int64_t>(self_ptr->get_buffer_size()),
179
+ "Number of elements in nested tensor must match number of elements in buffer.");
180
+ }
181
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
182
+ // Data structures and functions for generically applying a function on a nested
183
+ // tensor.
184
+ namespace impl {
185
+
186
+ template <typename T>
187
+ struct NestedNode {
188
+ NestedNode() = delete;
189
+ explicit NestedNode(std::vector<T>&& children)
190
+ : _is_leaf(false), _children(children) {}
191
+ explicit NestedNode(TensorList children)
192
+ : _is_leaf(false), _children(children.vec()) {}
193
+ // NestedNode(NestedNode&) = delete;
194
+ // NestedNode(const NestedNode&) = delete;
195
+ // NestedNode& operator=(NestedNode) = delete;
196
+ explicit NestedNode(T payload) : _is_leaf(true), _payload(std::move(payload)) {}
197
+ inline bool is_leaf() const {
198
+ return _is_leaf;
199
+ }
200
+ inline size_t degree() const {
201
+ return _children.size();
202
+ }
203
+ inline const std::vector<T> unbind() const {
204
+ return _children;
205
+ }
206
+ inline T children(size_t i) const {
207
+ return _children[i];
208
+ }
209
+ inline const T& payload() const {
210
+ return _payload;
211
+ }
212
+ inline T& payload() {
213
+ return _payload;
214
+ }
215
+
216
+ private:
217
+ bool _is_leaf;
218
+ std::vector<T> _children;
219
+ T _payload;
220
+ };
221
+
222
+ using TensorNode = NestedNode<at::Tensor>;
223
+
224
+ template <class F, class A, class TypeList>
225
+ class _map;
226
+
227
+ template <class F, class A, class... Args>
228
+ class _map<F, A, c10::guts::typelist::typelist<Args...>> {
229
+ public:
230
+ static A function_one(F&& fn, const Args&... nested_node) {
231
+ return std::forward<F>(fn)(nested_node...);
232
+ }
233
+ // NOTE: We must move F to avoid copying objects if it is a lambda with
234
+ // captures.
235
+ static NestedNode<A> function(
236
+ F&& fn,
237
+ const NestedNode<Args>&... nested_node) {
238
+ size_t degree = 0;
239
+ bool all_leaf = true;
240
+ c10::guts::tuple_map(
241
+ std::forward_as_tuple(nested_node...), [&all_leaf, &degree](auto n) {
242
+ all_leaf = all_leaf && (n.is_leaf());
243
+ if (degree > 1 && n.degree() > 1) {
244
+ TORCH_CHECK(
245
+ degree == n.degree(), "NestedNodes must match in degree.");
246
+ }
247
+ if (n.degree() > degree) {
248
+ degree = n.degree();
249
+ }
250
+ return nullptr;
251
+ });
252
+ // All NestedNodes just wrap regular objects.
253
+ if (all_leaf) {
254
+ return NestedNode<A>(std::forward<F>(fn)(nested_node.payload()...));
255
+ }
256
+ // Some NestedNodes wrap regular Tensors, some NestedTensors and some other
257
+ // types.
258
+ std::vector<A> result;
259
+ for (size_t i = 0; i < degree; i++) {
260
+ std::tuple<Args...> children = c10::guts::tuple_map(
261
+ std::forward_as_tuple(nested_node...), [&i](auto a) {
262
+ static_assert(
263
+ c10::guts::is_instantiation_of<NestedNode, decltype(a)>::value,
264
+ "Internal error.");
265
+ // Broadcast regular arguments across NestedTensor constituents.
266
+ // This could be a Tensor, integer or anything else really.
267
+ if (a.is_leaf()) {
268
+ return a.payload();
269
+ }
270
+ // Broadcast NestedTensors with one constituent.
271
+ if (a.degree() == 1 && !a.is_leaf()) {
272
+ return a.children(0);
273
+ }
274
+ TORCH_CHECK(a.degree() > 0, "Internal assert.");
275
+ return a.children(i);
276
+ });
277
+ c10::guts::apply(
278
+ [&result, &fn](Args... filtered) {
279
+ result.emplace_back(function_one(std::forward<F>(fn), filtered...));
280
+ },
281
+ std::move(children));
282
+ }
283
+ return NestedNode<A>(std::move(result));
284
+ }
285
+ };
286
+
287
+ // TODO: Add static assert to verify lambda arguments match nested_node types
288
+ template <class F, class... B>
289
+ static inline NestedNode<
290
+ typename c10::guts::infer_function_traits<F>::type::return_type>
291
+ map(F&& fn, const NestedNode<B>&... nested_node) {
292
+ return _map<
293
+ F,
294
+ typename c10::guts::infer_function_traits<F>::type::return_type,
295
+ typename c10::guts::infer_function_traits<F>::type::parameter_types>::
296
+ function(std::forward<F>(fn), nested_node...);
297
+ }
298
+
299
+ inline TensorNode get_nested_tensor_structure(at::Tensor tensor) {
300
+ if (get_nested_tensor_impl_or_null(tensor) == nullptr) {
301
+ return TensorNode(std::move(tensor));
302
+ }
303
+ return TensorNode(tensor.unbind());
304
+ }
305
+
306
+ inline Tensor wrap_tensor_node(
307
+ TensorNode tensor_node,
308
+ c10::optional<ScalarType> dtype,
309
+ c10::optional<Layout> layout,
310
+ c10::optional<Device> device,
311
+ c10::optional<bool> pin_memory) {
312
+ TORCH_CHECK(
313
+ !tensor_node.is_leaf(), "Expected TensorNode to wrap a list of Tensors.");
314
+ TensorOptions options_ =
315
+ TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(
316
+ pin_memory);
317
+ if (tensor_node.degree() == 0) {
318
+ return wrap_buffer(ones({0}, dtype, layout, device), ones({}));
319
+ }
320
+
321
+ // Fast path: if all tensors are on CPU, have contiguous memory, and the same
322
+ // dtype, copying can be done much faster.
323
+ bool all_tensors_cpu = true;
324
+ bool all_tensors_contiguous = true;
325
+ bool all_tensors_same_dtype = true;
326
+ auto first_dtype = tensor_node.children(0).dtype();
327
+ std::vector<long> start_offsets(tensor_node.degree());
328
+ start_offsets[0] = 0;
329
+ long total_size = 0;
330
+ for (const auto i : c10::irange(tensor_node.degree())) {
331
+ all_tensors_cpu = all_tensors_cpu && tensor_node.children(i).is_cpu();
332
+ all_tensors_contiguous =
333
+ all_tensors_contiguous && tensor_node.children(i).is_contiguous();
334
+ all_tensors_same_dtype = all_tensors_same_dtype &&
335
+ (first_dtype == tensor_node.children(i).dtype());
336
+ if (!(all_tensors_cpu && all_tensors_contiguous &&
337
+ all_tensors_same_dtype)) {
338
+ break;
339
+ }
340
+ if (i > 0) {
341
+ start_offsets[i] =
342
+ start_offsets[i - 1] + tensor_node.children(i - 1).numel();
343
+ }
344
+ total_size += tensor_node.children(i).numel();
345
+ }
346
+
347
+ TensorOptions options;
348
+ Tensor nt_buffer, nt_sizes;
349
+ if (all_tensors_cpu && all_tensors_contiguous && all_tensors_same_dtype) {
350
+ nt_buffer = at::empty({total_size}, tensor_node.children(0).options());
351
+ nt_sizes = at::empty(
352
+ {static_cast<long>(tensor_node.degree()),
353
+ static_cast<long>(tensor_node.children(0).sizes().size())},
354
+ TensorOptions().dtype(kLong));
355
+ AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
356
+ at::ScalarType::Half,
357
+ at::ScalarType::Bool,
358
+ at::ScalarType::BFloat16,
359
+ c10::typeMetaToScalarType(first_dtype),
360
+ "create_nt_buffer",
361
+ [&]() {
362
+ at::parallel_for(
363
+ 0, tensor_node.degree(), 1, [&](int64_t begin, int64_t end) {
364
+ for (int64_t i = begin; i < end; ++i) {
365
+ // Only try copying memory if there is more than 0 elements
366
+ // for a certain tensor
367
+ if (tensor_node.children(i).numel() > 0) {
368
+ memcpy(
369
+ nt_buffer.mutable_data_ptr<scalar_t>() + start_offsets[i],
370
+ tensor_node.children(i).const_data_ptr<scalar_t>(),
371
+ tensor_node.children(i).numel() * sizeof(scalar_t));
372
+ }
373
+ }
374
+ });
375
+ });
376
+ long sizes_offset = 0;
377
+ for (size_t i = 0; i < tensor_node.degree(); ++i) {
378
+ auto tensor_sizes = tensor_node.children(i).sizes();
379
+ for (int64_t tensor_size : tensor_sizes) {
380
+ nt_sizes.mutable_data_ptr<int64_t>()[sizes_offset++] = tensor_size;
381
+ }
382
+ }
383
+ options = nt_buffer.options().merge_in(options_);
384
+ } else { // Slow path
385
+ std::vector<Tensor> flat_tensors;
386
+ std::vector<Tensor> sizes;
387
+ for (const auto i : c10::irange(tensor_node.degree())) {
388
+ flat_tensors.push_back(tensor_node.children(i).reshape(-1).contiguous());
389
+ sizes.push_back(
390
+ tensor(c10::IntArrayRef(tensor_node.children(i).sizes())));
391
+ }
392
+ options = flat_tensors[0].options().merge_in(options_);
393
+ nt_buffer = at::cat(flat_tensors);
394
+ nt_sizes = at::native::stack(sizes);
395
+ }
396
+
397
+ return wrap_buffer(nt_buffer.to(options), nt_sizes);
398
+ }
399
+
400
+ } // namespace impl
401
+
402
+ // This function is meant to ease rapid operator coverage for
403
+ // NestedTensor kernels. It is not meant to be efficient. Use it judiciously.
404
+ template <class F, class... A>
405
+ inline at::Tensor map_nested_tensor(F&& fn, A... a) {
406
+ return wrap_tensor_node(
407
+ impl::map(std::forward<F>(fn), impl::get_nested_tensor_structure(a)...),
408
+ c10::nullopt,
409
+ c10::nullopt,
410
+ c10::nullopt,
411
+ c10::nullopt);
412
+ }
413
+
414
+ } // namespace native
415
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_add_relu_meta_dispatch.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace meta {
19
+
20
+ TORCH_API at::Tensor & _add_relu_(at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha=1);
21
+ TORCH_API at::Tensor & _add_relu_(at::Tensor & self, const at::Scalar & other, const at::Scalar & alpha=1);
22
+
23
+ } // namespace meta
24
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cslt_compress_cuda_dispatch.h ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace cuda {
19
+
20
+ TORCH_API at::Tensor _cslt_compress(const at::Tensor & input);
21
+
22
+ } // namespace cuda
23
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fft_c2r.h ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/_fft_c2r_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::_fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
26
+ inline at::Tensor _fft_c2r(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
27
+ return at::_ops::_fft_c2r::call(self, dim, normalization, last_dim_size);
28
+ }
29
+ namespace symint {
30
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
31
+ at::Tensor _fft_c2r(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
32
+ return at::_ops::_fft_c2r::call(self, dim, normalization, last_dim_size);
33
+ }
34
+ }
35
+
36
+ // aten::_fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor
37
+ inline at::Tensor _fft_c2r_symint(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
38
+ return at::_ops::_fft_c2r::call(self, dim, normalization, last_dim_size);
39
+ }
40
+ namespace symint {
41
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
42
+ at::Tensor _fft_c2r(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
43
+ return at::_ops::_fft_c2r::call(self, dim, normalization, last_dim_size);
44
+ }
45
+ }
46
+
47
+ // aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
48
+ inline at::Tensor & _fft_c2r_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
49
+ return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
50
+ }
51
+ namespace symint {
52
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
53
+ at::Tensor & _fft_c2r_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size) {
54
+ return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
55
+ }
56
+ }
57
+
58
+ // aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
59
+ inline at::Tensor & _fft_c2r_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size, at::Tensor & out) {
60
+ return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
61
+ }
62
+ namespace symint {
63
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
64
+ at::Tensor & _fft_c2r_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, int64_t last_dim_size, at::Tensor & out) {
65
+ return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
66
+ }
67
+ }
68
+
69
+ // aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
70
+ inline at::Tensor & _fft_c2r_symint_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
71
+ return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
72
+ }
73
+ namespace symint {
74
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
75
+ at::Tensor & _fft_c2r_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size) {
76
+ return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
77
+ }
78
+ }
79
+
80
+ // aten::_fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!)
81
+ inline at::Tensor & _fft_c2r_symint_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size, at::Tensor & out) {
82
+ return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
83
+ }
84
+ namespace symint {
85
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
86
+ at::Tensor & _fft_c2r_outf(const at::Tensor & self, at::IntArrayRef dim, int64_t normalization, c10::SymInt last_dim_size, at::Tensor & out) {
87
+ return at::_ops::_fft_c2r_out::call(self, dim, normalization, last_dim_size, out);
88
+ }
89
+ }
90
+
91
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcdiv_compositeexplicitautograd_dispatch.h ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace compositeexplicitautograd {
19
+
20
+ TORCH_API void _foreach_addcdiv_out(at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1);
21
+ TORCH_API void _foreach_addcdiv_outf(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value, at::TensorList out);
22
+ TORCH_API void _foreach_addcdiv_out(at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars);
23
+ TORCH_API void _foreach_addcdiv_outf(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef<at::Scalar> scalars, at::TensorList out);
24
+ TORCH_API void _foreach_addcdiv_out(at::TensorList out, at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars);
25
+ TORCH_API void _foreach_addcdiv_outf(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars, at::TensorList out);
26
+
27
+ } // namespace compositeexplicitautograd
28
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_cosh_ops.h ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Operator.h
4
+
5
+ #include <tuple>
6
+ #include <vector>
7
+
8
+ // Forward declarations of any types needed in the operator signatures.
9
+ // We can't directly include these classes because it will cause circular include dependencies.
10
+ // This file is included by TensorBody.h, which defines the Tensor class.
11
+ #include <ATen/core/ATen_fwd.h>
12
+
13
+ namespace at {
14
+ namespace _ops {
15
+
16
+
17
+ struct TORCH_API _foreach_cosh {
18
+ using schema = ::std::vector<at::Tensor> (at::TensorList);
19
+ using ptr_schema = schema*;
20
+ // See Note [static constexpr char* members for windows NVCC]
21
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_cosh")
22
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
23
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_cosh(Tensor[] self) -> Tensor[]")
24
+ static ::std::vector<at::Tensor> call(at::TensorList self);
25
+ static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self);
26
+ };
27
+
28
+ struct TORCH_API _foreach_cosh_ {
29
+ using schema = void (at::TensorList);
30
+ using ptr_schema = schema*;
31
+ // See Note [static constexpr char* members for windows NVCC]
32
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_cosh_")
33
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
34
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_cosh_(Tensor(a!)[] self) -> ()")
35
+ static void call(at::TensorList self);
36
+ static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self);
37
+ };
38
+
39
+ struct TORCH_API _foreach_cosh_out {
40
+ using schema = void (at::TensorList, at::TensorList);
41
+ using ptr_schema = schema*;
42
+ // See Note [static constexpr char* members for windows NVCC]
43
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_cosh")
44
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
45
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_cosh.out(Tensor[] self, *, Tensor(a!)[] out) -> ()")
46
+ static void call(at::TensorList self, at::TensorList out);
47
+ static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out);
48
+ };
49
+
50
+ }} // namespace at::_ops
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_erf.h ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/_foreach_erf_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::_foreach_erf(Tensor[] self) -> Tensor[]
26
+ inline ::std::vector<at::Tensor> _foreach_erf(at::TensorList self) {
27
+ return at::_ops::_foreach_erf::call(self);
28
+ }
29
+
30
+ // aten::_foreach_erf_(Tensor(a!)[] self) -> ()
31
+ inline void _foreach_erf_(at::TensorList self) {
32
+ return at::_ops::_foreach_erf_::call(self);
33
+ }
34
+
35
+ // aten::_foreach_erf.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
36
+ inline void _foreach_erf_out(at::TensorList out, at::TensorList self) {
37
+ return at::_ops::_foreach_erf_out::call(self, out);
38
+ }
39
+ // aten::_foreach_erf.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
40
+ inline void _foreach_erf_outf(at::TensorList self, at::TensorList out) {
41
+ return at::_ops::_foreach_erf_out::call(self, out);
42
+ }
43
+
44
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_minimum.h ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/_foreach_minimum_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::_foreach_minimum.Scalar(Tensor[] self, Scalar scalar) -> Tensor[]
26
+ inline ::std::vector<at::Tensor> _foreach_minimum(at::TensorList self, const at::Scalar & scalar) {
27
+ return at::_ops::_foreach_minimum_Scalar::call(self, scalar);
28
+ }
29
+
30
+ // aten::_foreach_minimum_.Scalar(Tensor(a!)[] self, Scalar scalar) -> ()
31
+ inline void _foreach_minimum_(at::TensorList self, const at::Scalar & scalar) {
32
+ return at::_ops::_foreach_minimum__Scalar::call(self, scalar);
33
+ }
34
+
35
+ // aten::_foreach_minimum.List(Tensor[] self, Tensor[] other) -> Tensor[]
36
+ inline ::std::vector<at::Tensor> _foreach_minimum(at::TensorList self, at::TensorList other) {
37
+ return at::_ops::_foreach_minimum_List::call(self, other);
38
+ }
39
+
40
+ // aten::_foreach_minimum_.List(Tensor(a!)[] self, Tensor[] other) -> ()
41
+ inline void _foreach_minimum_(at::TensorList self, at::TensorList other) {
42
+ return at::_ops::_foreach_minimum__List::call(self, other);
43
+ }
44
+
45
+ // aten::_foreach_minimum.ScalarList(Tensor[] self, Scalar[] scalars) -> Tensor[]
46
+ inline ::std::vector<at::Tensor> _foreach_minimum(at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
47
+ return at::_ops::_foreach_minimum_ScalarList::call(self, scalars);
48
+ }
49
+
50
+ // aten::_foreach_minimum_.ScalarList(Tensor(a!)[] self, Scalar[] scalars) -> ()
51
+ inline void _foreach_minimum_(at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
52
+ return at::_ops::_foreach_minimum__ScalarList::call(self, scalars);
53
+ }
54
+
55
+ // aten::_foreach_minimum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
56
+ inline void _foreach_minimum_out(at::TensorList out, at::TensorList self, const at::Scalar & scalar) {
57
+ return at::_ops::_foreach_minimum_Scalar_out::call(self, scalar, out);
58
+ }
59
+ // aten::_foreach_minimum.Scalar_out(Tensor[] self, Scalar scalar, *, Tensor(a!)[] out) -> ()
60
+ inline void _foreach_minimum_outf(at::TensorList self, const at::Scalar & scalar, at::TensorList out) {
61
+ return at::_ops::_foreach_minimum_Scalar_out::call(self, scalar, out);
62
+ }
63
+
64
+ // aten::_foreach_minimum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
65
+ inline void _foreach_minimum_out(at::TensorList out, at::TensorList self, at::TensorList other) {
66
+ return at::_ops::_foreach_minimum_List_out::call(self, other, out);
67
+ }
68
+ // aten::_foreach_minimum.List_out(Tensor[] self, Tensor[] other, *, Tensor(a!)[] out) -> ()
69
+ inline void _foreach_minimum_outf(at::TensorList self, at::TensorList other, at::TensorList out) {
70
+ return at::_ops::_foreach_minimum_List_out::call(self, other, out);
71
+ }
72
+
73
+ // aten::_foreach_minimum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
74
+ inline void _foreach_minimum_out(at::TensorList out, at::TensorList self, at::ArrayRef<at::Scalar> scalars) {
75
+ return at::_ops::_foreach_minimum_ScalarList_out::call(self, scalars, out);
76
+ }
77
+ // aten::_foreach_minimum.ScalarList_out(Tensor[] self, Scalar[] scalars, *, Tensor(a!)[] out) -> ()
78
+ inline void _foreach_minimum_outf(at::TensorList self, at::ArrayRef<at::Scalar> scalars, at::TensorList out) {
79
+ return at::_ops::_foreach_minimum_ScalarList_out::call(self, scalars, out);
80
+ }
81
+
82
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_trunc.h ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/_foreach_trunc_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::_foreach_trunc(Tensor[] self) -> Tensor[]
26
+ inline ::std::vector<at::Tensor> _foreach_trunc(at::TensorList self) {
27
+ return at::_ops::_foreach_trunc::call(self);
28
+ }
29
+
30
+ // aten::_foreach_trunc_(Tensor(a!)[] self) -> ()
31
+ inline void _foreach_trunc_(at::TensorList self) {
32
+ return at::_ops::_foreach_trunc_::call(self);
33
+ }
34
+
35
+ // aten::_foreach_trunc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
36
+ inline void _foreach_trunc_out(at::TensorList out, at::TensorList self) {
37
+ return at::_ops::_foreach_trunc_out::call(self, out);
38
+ }
39
+ // aten::_foreach_trunc.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
40
+ inline void _foreach_trunc_outf(at::TensorList self, at::TensorList out) {
41
+ return at::_ops::_foreach_trunc_out::call(self, out);
42
+ }
43
+
44
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fw_primal.h ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/_fw_primal_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+
26
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_int_mm_cuda_dispatch.h ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace cuda {
19
+
20
+ TORCH_API at::Tensor _int_mm(const at::Tensor & self, const at::Tensor & mat2);
21
+ TORCH_API at::Tensor & _int_mm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat2);
22
+ TORCH_API at::Tensor & _int_mm_outf(const at::Tensor & self, const at::Tensor & mat2, at::Tensor & out);
23
+
24
+ } // namespace cuda
25
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_compositeexplicitautograd_dispatch.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace compositeexplicitautograd {
19
+
20
+ TORCH_API at::Tensor & _masked_softmax_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, c10::optional<int64_t> dim=c10::nullopt, c10::optional<int64_t> mask_type=c10::nullopt);
21
+ TORCH_API at::Tensor & _masked_softmax_outf(const at::Tensor & self, const at::Tensor & mask, c10::optional<int64_t> dim, c10::optional<int64_t> mask_type, at::Tensor & out);
22
+
23
+ } // namespace compositeexplicitautograd
24
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_view_from_buffer_cpu_dispatch.h ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace cpu {
19
+
20
+ TORCH_API at::Tensor _nested_view_from_buffer(const at::Tensor & self, const at::Tensor & nested_size, const at::Tensor & nested_strides, const at::Tensor & offsets);
21
+
22
+ } // namespace cpu
23
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_remove_batch_dim.h ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/_remove_batch_dim_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::_remove_batch_dim(Tensor self, int level, int batch_size, int out_dim) -> Tensor
26
+ inline at::Tensor _remove_batch_dim(const at::Tensor & self, int64_t level, int64_t batch_size, int64_t out_dim) {
27
+ return at::_ops::_remove_batch_dim::call(self, level, batch_size, out_dim);
28
+ }
29
+
30
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_segment_reduce_backward_cpu_dispatch.h ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace cpu {
19
+
20
+ TORCH_API at::Tensor _segment_reduce_backward(const at::Tensor & grad, const at::Tensor & output, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths={}, const c10::optional<at::Tensor> & offsets={}, int64_t axis=0, const c10::optional<at::Scalar> & initial=c10::nullopt);
21
+
22
+ } // namespace cpu
23
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_bsc_tensor_unsafe_native.h ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from NativeFunction.h
4
+
5
+ #include <c10/core/Scalar.h>
6
+ #include <c10/core/Storage.h>
7
+ #include <c10/core/TensorOptions.h>
8
+ #include <c10/util/Deprecated.h>
9
+ #include <c10/util/Optional.h>
10
+ #include <c10/core/QScheme.h>
11
+ #include <ATen/core/Reduction.h>
12
+ #include <ATen/core/Tensor.h>
13
+ #include <tuple>
14
+ #include <vector>
15
+
16
+
17
+ namespace at {
18
+ namespace native {
19
+ TORCH_API at::Tensor _sparse_bsc_tensor_unsafe(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={});
20
+ } // namespace native
21
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_compositeimplicitautograd_dispatch.h ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace compositeimplicitautograd {
19
+
20
+ TORCH_API at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={});
21
+ TORCH_API at::Tensor _sparse_compressed_tensor_unsafe(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
22
+ TORCH_API at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={});
23
+ TORCH_API at::Tensor _sparse_compressed_tensor_unsafe_symint(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
24
+
25
+ } // namespace compositeimplicitautograd
26
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_differentiable_lstm_cell_backward_native.h ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from NativeFunction.h
4
+
5
+ #include <c10/core/Scalar.h>
6
+ #include <c10/core/Storage.h>
7
+ #include <c10/core/TensorOptions.h>
8
+ #include <c10/util/Deprecated.h>
9
+ #include <c10/util/Optional.h>
10
+ #include <c10/core/QScheme.h>
11
+ #include <ATen/core/Reduction.h>
12
+ #include <ATen/core/Tensor.h>
13
+ #include <tuple>
14
+ #include <vector>
15
+
16
+
17
+ namespace at {
18
+ namespace native {
19
+ TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _thnn_differentiable_lstm_cell_backward(const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const c10::optional<at::Tensor> & input_bias, const c10::optional<at::Tensor> & hidden_bias, const at::Tensor & cx, const at::Tensor & cy);
20
+ } // namespace native
21
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_bsr_ops.h ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Operator.h
4
+
5
+ #include <tuple>
6
+ #include <vector>
7
+
8
+ // Forward declarations of any types needed in the operator signatures.
9
+ // We can't directly include these classes because it will cause circular include dependencies.
10
+ // This file is included by TensorBody.h, which defines the Tensor class.
11
+ #include <ATen/core/ATen_fwd.h>
12
+
13
+ namespace at {
14
+ namespace _ops {
15
+
16
+
17
+ struct TORCH_API _to_sparse_bsr {
18
+ using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, c10::optional<int64_t>);
19
+ using ptr_schema = schema*;
20
+ // See Note [static constexpr char* members for windows NVCC]
21
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_to_sparse_bsr")
22
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
23
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_to_sparse_bsr(Tensor self, int[2] blocksize, int? dense_dim=None) -> Tensor")
24
+ static at::Tensor call(const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim);
25
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim);
26
+ };
27
+
28
+ struct TORCH_API _to_sparse_bsr_out {
29
+ using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, c10::optional<int64_t>, at::Tensor &);
30
+ using ptr_schema = schema*;
31
+ // See Note [static constexpr char* members for windows NVCC]
32
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_to_sparse_bsr")
33
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
34
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_to_sparse_bsr.out(Tensor self, int[2] blocksize, int? dense_dim=None, *, Tensor(a!) out) -> Tensor(a!)")
35
+ static at::Tensor & call(const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out);
36
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out);
37
+ };
38
+
39
+ }} // namespace at::_ops
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_cuda_dispatch.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace cuda {
19
+
20
+ TORCH_API at::Tensor _to_sparse(const at::Tensor & self, int64_t sparse_dim);
21
+ TORCH_API at::Tensor _to_sparse(const at::Tensor & self, c10::optional<at::Layout> layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional<int64_t> dense_dim=c10::nullopt);
22
+
23
+ } // namespace cuda
24
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_backward.h ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/_upsample_bicubic2d_aa_backward_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
26
+ inline at::Tensor & _upsample_bicubic2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
27
+ return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
28
+ }
29
+ namespace symint {
30
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
31
+ at::Tensor & _upsample_bicubic2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
32
+ return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
33
+ }
34
+ }
35
+
36
+ // aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
37
+ inline at::Tensor & _upsample_bicubic2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
38
+ return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
39
+ }
40
+ namespace symint {
41
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
42
+ at::Tensor & _upsample_bicubic2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
43
+ return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
44
+ }
45
+ }
46
+
47
+ // aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
48
+ inline at::Tensor & _upsample_bicubic2d_aa_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
49
+ return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
50
+ }
51
+ namespace symint {
52
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
53
+ at::Tensor & _upsample_bicubic2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
54
+ return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
55
+ }
56
+ }
57
+
58
+ // aten::_upsample_bicubic2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
59
+ inline at::Tensor & _upsample_bicubic2d_aa_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
60
+ return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
61
+ }
62
+ namespace symint {
63
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
64
+ at::Tensor & _upsample_bicubic2d_aa_backward_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
65
+ return at::_ops::_upsample_bicubic2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
66
+ }
67
+ }
68
+
69
+ // aten::_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
70
+ inline at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
71
+ return at::_ops::_upsample_bicubic2d_aa_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
72
+ }
73
+ namespace symint {
74
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
75
+ at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
76
+ return at::_ops::_upsample_bicubic2d_aa_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
77
+ }
78
+ }
79
+
80
+ // aten::_upsample_bicubic2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
81
+ inline at::Tensor _upsample_bicubic2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
82
+ return at::_ops::_upsample_bicubic2d_aa_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w);
83
+ }
84
+ namespace symint {
85
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
86
+ at::Tensor _upsample_bicubic2d_aa_backward(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
87
+ return at::_ops::_upsample_bicubic2d_aa_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w);
88
+ }
89
+ }
90
+
91
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_ops.h ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Operator.h
4
+
5
+ #include <tuple>
6
+ #include <vector>
7
+
8
+ // Forward declarations of any types needed in the operator signatures.
9
+ // We can't directly include these classes because it will cause circular include dependencies.
10
+ // This file is included by TensorBody.h, which defines the Tensor class.
11
+ #include <ATen/core/ATen_fwd.h>
12
+
13
+ namespace at {
14
+ namespace _ops {
15
+
16
+
17
+ struct TORCH_API all_dim {
18
+ using schema = at::Tensor (const at::Tensor &, int64_t, bool);
19
+ using ptr_schema = schema*;
20
+ // See Note [static constexpr char* members for windows NVCC]
21
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
22
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dim")
23
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dim(Tensor self, int dim, bool keepdim=False) -> Tensor")
24
+ static at::Tensor call(const at::Tensor & self, int64_t dim, bool keepdim);
25
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim);
26
+ };
27
+
28
+ struct TORCH_API all_dims {
29
+ using schema = at::Tensor (const at::Tensor &, at::OptionalIntArrayRef, bool);
30
+ using ptr_schema = schema*;
31
+ // See Note [static constexpr char* members for windows NVCC]
32
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
33
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dims")
34
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor")
35
+ static at::Tensor call(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim);
36
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim);
37
+ };
38
+
39
+ struct TORCH_API all_out {
40
+ using schema = at::Tensor & (const at::Tensor &, int64_t, bool, at::Tensor &);
41
+ using ptr_schema = schema*;
42
+ // See Note [static constexpr char* members for windows NVCC]
43
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
44
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
45
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)")
46
+ static at::Tensor & call(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out);
47
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out);
48
+ };
49
+
50
+ struct TORCH_API all_dims_out {
51
+ using schema = at::Tensor & (const at::Tensor &, at::OptionalIntArrayRef, bool, at::Tensor &);
52
+ using ptr_schema = schema*;
53
+ // See Note [static constexpr char* members for windows NVCC]
54
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
55
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dims_out")
56
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)")
57
+ static at::Tensor & call(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out);
58
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out);
59
+ };
60
+
61
+ struct TORCH_API all_dimname {
62
+ using schema = at::Tensor (const at::Tensor &, at::Dimname, bool);
63
+ using ptr_schema = schema*;
64
+ // See Note [static constexpr char* members for windows NVCC]
65
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
66
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dimname")
67
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor")
68
+ static at::Tensor call(const at::Tensor & self, at::Dimname dim, bool keepdim);
69
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim);
70
+ };
71
+
72
+ struct TORCH_API all_dimname_out {
73
+ using schema = at::Tensor & (const at::Tensor &, at::Dimname, bool, at::Tensor &);
74
+ using ptr_schema = schema*;
75
+ // See Note [static constexpr char* members for windows NVCC]
76
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
77
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "dimname_out")
78
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)")
79
+ static at::Tensor & call(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & out);
80
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & out);
81
+ };
82
+
83
+ struct TORCH_API all {
84
+ using schema = at::Tensor (const at::Tensor &);
85
+ using ptr_schema = schema*;
86
+ // See Note [static constexpr char* members for windows NVCC]
87
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
88
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
89
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all(Tensor self) -> Tensor")
90
+ static at::Tensor call(const at::Tensor & self);
91
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
92
+ };
93
+
94
+ struct TORCH_API all_all_out {
95
+ using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
96
+ using ptr_schema = schema*;
97
+ // See Note [static constexpr char* members for windows NVCC]
98
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::all")
99
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "all_out")
100
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
101
+ static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
102
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
103
+ };
104
+
105
+ }} // namespace at::_ops
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arctan2_ops.h ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Operator.h
4
+
5
+ #include <tuple>
6
+ #include <vector>
7
+
8
+ // Forward declarations of any types needed in the operator signatures.
9
+ // We can't directly include these classes because it will cause circular include dependencies.
10
+ // This file is included by TensorBody.h, which defines the Tensor class.
11
+ #include <ATen/core/ATen_fwd.h>
12
+
13
+ namespace at {
14
+ namespace _ops {
15
+
16
+
17
+ struct TORCH_API arctan2 {
18
+ using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
19
+ using ptr_schema = schema*;
20
+ // See Note [static constexpr char* members for windows NVCC]
21
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arctan2")
22
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
23
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arctan2(Tensor self, Tensor other) -> Tensor")
24
+ static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
25
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
26
+ };
27
+
28
+ struct TORCH_API arctan2_out {
29
+ using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
30
+ using ptr_schema = schema*;
31
+ // See Note [static constexpr char* members for windows NVCC]
32
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arctan2")
33
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
34
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")
35
+ static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
36
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
37
+ };
38
+
39
+ struct TORCH_API arctan2_ {
40
+ using schema = at::Tensor & (at::Tensor &, const at::Tensor &);
41
+ using ptr_schema = schema*;
42
+ // See Note [static constexpr char* members for windows NVCC]
43
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::arctan2_")
44
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
45
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "arctan2_(Tensor(a!) self, Tensor other) -> Tensor(a!)")
46
+ static at::Tensor & call(at::Tensor & self, const at::Tensor & other);
47
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other);
48
+ };
49
+
50
+ }} // namespace at::_ops
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/asin_compositeexplicitautogradnonfunctional_dispatch.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace compositeexplicitautogradnonfunctional {
19
+
20
+ TORCH_API at::Tensor asin(const at::Tensor & self);
21
+ TORCH_API at::Tensor & asin_(at::Tensor & self);
22
+
23
+ } // namespace compositeexplicitautogradnonfunctional
24
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_backward_native.h ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from NativeFunction.h
4
+
5
+ #include <c10/core/Scalar.h>
6
+ #include <c10/core/Storage.h>
7
+ #include <c10/core/TensorOptions.h>
8
+ #include <c10/util/Deprecated.h>
9
+ #include <c10/util/Optional.h>
10
+ #include <c10/core/QScheme.h>
11
+ #include <ATen/core/Reduction.h>
12
+ #include <ATen/core/Tensor.h>
13
+ #include <tuple>
14
+ #include <vector>
15
+ #include <ATen/ops/avg_pool2d_backward_meta.h>
16
+
17
+ namespace at {
18
+ namespace native {
19
+ struct TORCH_API structured_avg_pool2d_backward_out_cpu : public at::meta::structured_avg_pool2d_backward {
20
+ void impl(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, const at::Tensor & grad_input);
21
+ };
22
+ struct TORCH_API structured_avg_pool2d_backward_out_cuda : public at::meta::structured_avg_pool2d_backward {
23
+ void impl(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, const at::Tensor & grad_input);
24
+ };
25
+ TORCH_API at::Tensor mkldnn_avg_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override);
26
+ TORCH_API at::Tensor & mkldnn_avg_pool2d_backward_out(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional<int64_t> divisor_override, at::Tensor & grad_input);
27
+ } // namespace native
28
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_xor_meta.h ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from NativeMetaFunction.h
4
+
5
+ #include <c10/core/Scalar.h>
6
+ #include <c10/core/Storage.h>
7
+ #include <c10/core/TensorOptions.h>
8
+ #include <c10/util/Deprecated.h>
9
+ #include <c10/util/Optional.h>
10
+ #include <c10/core/QScheme.h>
11
+ #include <ATen/core/Reduction.h>
12
+ #include <ATen/TensorIterator.h>
13
+ #include <ATen/TensorMeta.h>
14
+ #include <tuple>
15
+ #include <vector>
16
+
17
+ namespace at {
18
+ namespace meta {
19
+
20
+ struct TORCH_API structured_bitwise_xor_Tensor : public TensorIteratorBase {
21
+
22
+
23
+ void meta(const at::Tensor & self, const at::Tensor & other);
24
+ };
25
+
26
+ } // namespace native
27
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_meta.h ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from NativeMetaFunction.h
4
+
5
+ #include <c10/core/Scalar.h>
6
+ #include <c10/core/Storage.h>
7
+ #include <c10/core/TensorOptions.h>
8
+ #include <c10/util/Deprecated.h>
9
+ #include <c10/util/Optional.h>
10
+ #include <c10/core/QScheme.h>
11
+ #include <ATen/core/Reduction.h>
12
+ #include <ATen/TensorIterator.h>
13
+ #include <ATen/TensorMeta.h>
14
+ #include <tuple>
15
+ #include <vector>
16
+
17
+ namespace at {
18
+ namespace meta {
19
+
20
+ struct TORCH_API structured_ceil : public TensorIteratorBase {
21
+
22
+
23
+ void meta(const at::Tensor & self);
24
+ };
25
+
26
+ } // namespace native
27
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clamp_ops.h ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Operator.h
4
+
5
+ #include <tuple>
6
+ #include <vector>
7
+
8
+ // Forward declarations of any types needed in the operator signatures.
9
+ // We can't directly include these classes because it will cause circular include dependencies.
10
+ // This file is included by TensorBody.h, which defines the Tensor class.
11
+ #include <ATen/core/ATen_fwd.h>
12
+
13
+ namespace at {
14
+ namespace _ops {
15
+
16
+
17
+ struct TORCH_API clamp {
18
+ using schema = at::Tensor (const at::Tensor &, const c10::optional<at::Scalar> &, const c10::optional<at::Scalar> &);
19
+ using ptr_schema = schema*;
20
+ // See Note [static constexpr char* members for windows NVCC]
21
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp")
22
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
23
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp(Tensor self, Scalar? min=None, Scalar? max=None) -> Tensor")
24
+ static at::Tensor call(const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max);
25
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max);
26
+ };
27
+
28
+ struct TORCH_API clamp_Tensor {
29
+ using schema = at::Tensor (const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &);
30
+ using ptr_schema = schema*;
31
+ // See Note [static constexpr char* members for windows NVCC]
32
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp")
33
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
34
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp.Tensor(Tensor self, Tensor? min=None, Tensor? max=None) -> Tensor")
35
+ static at::Tensor call(const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max);
36
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max);
37
+ };
38
+
39
+ struct TORCH_API clamp_ {
40
+ using schema = at::Tensor & (at::Tensor &, const c10::optional<at::Scalar> &, const c10::optional<at::Scalar> &);
41
+ using ptr_schema = schema*;
42
+ // See Note [static constexpr char* members for windows NVCC]
43
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp_")
44
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
45
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp_(Tensor(a!) self, Scalar? min=None, Scalar? max=None) -> Tensor(a!)")
46
+ static at::Tensor & call(at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max);
47
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max);
48
+ };
49
+
50
+ struct TORCH_API clamp__Tensor {
51
+ using schema = at::Tensor & (at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &);
52
+ using ptr_schema = schema*;
53
+ // See Note [static constexpr char* members for windows NVCC]
54
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp_")
55
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
56
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp_.Tensor(Tensor(a!) self, Tensor? min=None, Tensor? max=None) -> Tensor(a!)")
57
+ static at::Tensor & call(at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max);
58
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max);
59
+ };
60
+
61
+ struct TORCH_API clamp_out {
62
+ using schema = at::Tensor & (const at::Tensor &, const c10::optional<at::Scalar> &, const c10::optional<at::Scalar> &, at::Tensor &);
63
+ using ptr_schema = schema*;
64
+ // See Note [static constexpr char* members for windows NVCC]
65
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp")
66
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
67
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp.out(Tensor self, Scalar? min=None, Scalar? max=None, *, Tensor(a!) out) -> Tensor(a!)")
68
+ static at::Tensor & call(const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max, at::Tensor & out);
69
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Scalar> & min, const c10::optional<at::Scalar> & max, at::Tensor & out);
70
+ };
71
+
72
+ struct TORCH_API clamp_Tensor_out {
73
+ using schema = at::Tensor & (const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, at::Tensor &);
74
+ using ptr_schema = schema*;
75
+ // See Note [static constexpr char* members for windows NVCC]
76
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::clamp")
77
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor_out")
78
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "clamp.Tensor_out(Tensor self, Tensor? min=None, Tensor? max=None, *, Tensor(a!) out) -> Tensor(a!)")
79
+ static at::Tensor & call(const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max, at::Tensor & out);
80
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const c10::optional<at::Tensor> & min, const c10::optional<at::Tensor> & max, at::Tensor & out);
81
+ };
82
+
83
+ }} // namespace at::_ops
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward.h ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/convolution_backward_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
26
+ inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
27
+ return at::_ops::convolution_backward::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
28
+ }
29
+ namespace symint {
30
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
31
+ ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
32
+ return at::_ops::convolution_backward::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
33
+ }
34
+ }
35
+
36
+ // aten::convolution_backward(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
37
+ inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward_symint(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
38
+ return at::_ops::convolution_backward::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask);
39
+ }
40
+ namespace symint {
41
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
42
+ ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
43
+ return at::_ops::convolution_backward::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask);
44
+ }
45
+ }
46
+
47
+ // aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
48
+ inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
49
+ return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
50
+ }
51
+ namespace symint {
52
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
53
+ ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
54
+ return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
55
+ }
56
+ }
57
+
58
+ // aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
59
+ inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
60
+ return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
61
+ }
62
+ namespace symint {
63
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
64
+ ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
65
+ return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*bias_sizes)) : c10::nullopt, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask, out0, out1, out2);
66
+ }
67
+ }
68
+
69
+ // aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
70
+ inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_symint_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
71
+ return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
72
+ }
73
+ namespace symint {
74
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
75
+ ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
76
+ return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
77
+ }
78
+ }
79
+
80
+ // aten::convolution_backward.out(Tensor grad_output, Tensor input, Tensor weight, SymInt[]? bias_sizes, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
81
+ inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
82
+ return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
83
+ }
84
+ namespace symint {
85
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
86
+ ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
87
+ return at::_ops::convolution_backward_out::call(grad_output, input, weight, bias_sizes, stride, padding, dilation, transposed, output_padding, groups, output_mask, out0, out1, out2);
88
+ }
89
+ }
90
+
91
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward_cuda_dispatch.h ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace cuda {
19
+
20
+ TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask);
21
+ TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward_symint(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask);
22
+
23
+ } // namespace cuda
24
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_add_relu.h ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/cudnn_convolution_add_relu_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
26
+ inline at::Tensor cudnn_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
27
+ return at::_ops::cudnn_convolution_add_relu::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
28
+ }
29
+ namespace symint {
30
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
31
+ at::Tensor cudnn_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
32
+ return at::_ops::cudnn_convolution_add_relu::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
33
+ }
34
+ }
35
+
36
+ // aten::cudnn_convolution_add_relu(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor
37
+ inline at::Tensor cudnn_convolution_add_relu_symint(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
38
+ return at::_ops::cudnn_convolution_add_relu::call(self, weight, z, alpha, bias, stride, padding, dilation, groups);
39
+ }
40
+ namespace symint {
41
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
42
+ at::Tensor cudnn_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
43
+ return at::_ops::cudnn_convolution_add_relu::call(self, weight, z, alpha, bias, stride, padding, dilation, groups);
44
+ }
45
+ }
46
+
47
+ // aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
48
+ inline at::Tensor & cudnn_convolution_add_relu_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
49
+ return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
50
+ }
51
+ namespace symint {
52
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
53
+ at::Tensor & cudnn_convolution_add_relu_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups) {
54
+ return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
55
+ }
56
+ }
57
+
58
+ // aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
59
+ inline at::Tensor & cudnn_convolution_add_relu_outf(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
60
+ return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
61
+ }
62
+ namespace symint {
63
+ template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
64
+ at::Tensor & cudnn_convolution_add_relu_outf(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups, at::Tensor & out) {
65
+ return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups, out);
66
+ }
67
+ }
68
+
69
+ // aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
70
+ inline at::Tensor & cudnn_convolution_add_relu_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
71
+ return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
72
+ }
73
+ namespace symint {
74
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
75
+ at::Tensor & cudnn_convolution_add_relu_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups) {
76
+ return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
77
+ }
78
+ }
79
+
80
+ // aten::cudnn_convolution_add_relu.out(Tensor self, Tensor weight, Tensor z, Scalar? alpha, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)
81
+ inline at::Tensor & cudnn_convolution_add_relu_symint_outf(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
82
+ return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
83
+ }
84
+ namespace symint {
85
+ template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
86
+ at::Tensor & cudnn_convolution_add_relu_outf(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out) {
87
+ return at::_ops::cudnn_convolution_add_relu_out::call(self, weight, z, alpha, bias, stride, padding, dilation, groups, out);
88
+ }
89
+ }
90
+
91
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_add_relu_native.h ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from NativeFunction.h
4
+
5
+ #include <c10/core/Scalar.h>
6
+ #include <c10/core/Storage.h>
7
+ #include <c10/core/TensorOptions.h>
8
+ #include <c10/util/Deprecated.h>
9
+ #include <c10/util/Optional.h>
10
+ #include <c10/core/QScheme.h>
11
+ #include <ATen/core/Reduction.h>
12
+ #include <ATen/core/Tensor.h>
13
+ #include <tuple>
14
+ #include <vector>
15
+
16
+
17
+ namespace at {
18
+ namespace native {
19
+ TORCH_API at::Tensor & cudnn_convolution_add_relu_out_symint(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out);
20
+ TORCH_API at::Tensor cudnn_convolution_add_relu(const at::Tensor & self, const at::Tensor & weight, const at::Tensor & z, const c10::optional<at::Scalar> & alpha, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, int64_t groups);
21
+ } // namespace native
22
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_convolution_native.h ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from NativeFunction.h
4
+
5
+ #include <c10/core/Scalar.h>
6
+ #include <c10/core/Storage.h>
7
+ #include <c10/core/TensorOptions.h>
8
+ #include <c10/util/Deprecated.h>
9
+ #include <c10/util/Optional.h>
10
+ #include <c10/core/QScheme.h>
11
+ #include <ATen/core/Reduction.h>
12
+ #include <ATen/core/Tensor.h>
13
+ #include <tuple>
14
+ #include <vector>
15
+
16
+
17
+ namespace at {
18
+ namespace native {
19
+ TORCH_API at::Tensor cudnn_convolution(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32);
20
+ TORCH_API at::Tensor & cudnn_convolution_out(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic, bool allow_tf32, at::Tensor & out);
21
+ } // namespace native
22
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/eye_compositeexplicitautograd_dispatch.h ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace compositeexplicitautograd {
19
+
20
+ TORCH_API at::Tensor eye(int64_t n, at::TensorOptions options={});
21
+ TORCH_API at::Tensor eye(int64_t n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
22
+ TORCH_API at::Tensor eye_symint(c10::SymInt n, at::TensorOptions options={});
23
+ TORCH_API at::Tensor eye_symint(c10::SymInt n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
24
+ TORCH_API at::Tensor eye(int64_t n, int64_t m, at::TensorOptions options={});
25
+ TORCH_API at::Tensor eye(int64_t n, int64_t m, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
26
+ TORCH_API at::Tensor eye_symint(c10::SymInt n, c10::SymInt m, at::TensorOptions options={});
27
+ TORCH_API at::Tensor eye_symint(c10::SymInt n, c10::SymInt m, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
28
+
29
+ } // namespace compositeexplicitautograd
30
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fbgemm_pack_gemm_matrix_fp16.h ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Function.h
4
+
5
+ #include <ATen/Context.h>
6
+ #include <ATen/DeviceGuard.h>
7
+ #include <ATen/TensorUtils.h>
8
+ #include <ATen/TracerMode.h>
9
+ #include <ATen/core/Generator.h>
10
+ #include <ATen/core/Reduction.h>
11
+ #include <ATen/core/Tensor.h>
12
+ #include <c10/core/Scalar.h>
13
+ #include <c10/core/Storage.h>
14
+ #include <c10/core/TensorOptions.h>
15
+ #include <c10/util/Deprecated.h>
16
+ #include <c10/util/Optional.h>
17
+
18
+
19
+
20
+ #include <ATen/ops/fbgemm_pack_gemm_matrix_fp16_ops.h>
21
+
22
+ namespace at {
23
+
24
+
25
+ // aten::fbgemm_pack_gemm_matrix_fp16(Tensor input) -> Tensor
26
+ inline at::Tensor fbgemm_pack_gemm_matrix_fp16(const at::Tensor & input) {
27
+ return at::_ops::fbgemm_pack_gemm_matrix_fp16::call(input);
28
+ }
29
+
30
+ }
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfftn_ops.h ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Operator.h
4
+
5
+ #include <tuple>
6
+ #include <vector>
7
+
8
+ // Forward declarations of any types needed in the operator signatures.
9
+ // We can't directly include these classes because it will cause circular include dependencies.
10
+ // This file is included by TensorBody.h, which defines the Tensor class.
11
+ #include <ATen/core/ATen_fwd.h>
12
+
13
+ namespace at {
14
+ namespace _ops {
15
+
16
+
17
+ struct TORCH_API fft_irfftn {
18
+ using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, at::OptionalIntArrayRef, c10::optional<c10::string_view>);
19
+ using ptr_schema = schema*;
20
+ // See Note [static constexpr char* members for windows NVCC]
21
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_irfftn")
22
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
23
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_irfftn(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None) -> Tensor")
24
+ static at::Tensor call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm);
25
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm);
26
+ };
27
+
28
+ struct TORCH_API fft_irfftn_out {
29
+ using schema = at::Tensor & (const at::Tensor &, at::OptionalSymIntArrayRef, at::OptionalIntArrayRef, c10::optional<c10::string_view>, at::Tensor &);
30
+ using ptr_schema = schema*;
31
+ // See Note [static constexpr char* members for windows NVCC]
32
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_irfftn")
33
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
34
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_irfftn.out(Tensor self, SymInt[1]? s=None, int[1]? dim=None, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)")
35
+ static at::Tensor & call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out);
36
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out);
37
+ };
38
+
39
+ }} // namespace at::_ops
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fractional_max_pool3d_cpu_dispatch.h ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace cpu {
19
+
20
+ TORCH_API ::std::tuple<at::Tensor,at::Tensor> fractional_max_pool3d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples);
21
+ TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool3d_out(at::Tensor & output, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples);
22
+ TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> fractional_max_pool3d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef output_size, const at::Tensor & random_samples, at::Tensor & output, at::Tensor & indices);
23
+
24
+ } // namespace cpu
25
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/glu_jvp_ops.h ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Operator.h
4
+
5
+ #include <tuple>
6
+ #include <vector>
7
+
8
+ // Forward declarations of any types needed in the operator signatures.
9
+ // We can't directly include these classes because it will cause circular include dependencies.
10
+ // This file is included by TensorBody.h, which defines the Tensor class.
11
+ #include <ATen/core/ATen_fwd.h>
12
+
13
+ namespace at {
14
+ namespace _ops {
15
+
16
+
17
+ struct TORCH_API glu_jvp {
18
+ using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t);
19
+ using ptr_schema = schema*;
20
+ // See Note [static constexpr char* members for windows NVCC]
21
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::glu_jvp")
22
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
23
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "glu_jvp(Tensor glu, Tensor x, Tensor dx, int dim) -> Tensor")
24
+ static at::Tensor call(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim);
25
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim);
26
+ };
27
+
28
+ struct TORCH_API glu_jvp_out {
29
+ using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &);
30
+ using ptr_schema = schema*;
31
+ // See Note [static constexpr char* members for windows NVCC]
32
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::glu_jvp")
33
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
34
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "glu_jvp.out(Tensor glu, Tensor x, Tensor dx, int dim, *, Tensor(a!) out) -> Tensor(a!)")
35
+ static at::Tensor & call(const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out);
36
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & glu, const at::Tensor & x, const at::Tensor & dx, int64_t dim, at::Tensor & out);
37
+ };
38
+
39
+ }} // namespace at::_ops
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardswish_backward_cpu_dispatch.h ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace cpu {
19
+
20
+ TORCH_API at::Tensor hardswish_backward(const at::Tensor & grad_output, const at::Tensor & self);
21
+
22
+ } // namespace cpu
23
+ } // namespace at
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardswish_ops.h ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+
3
+ // @generated by torchgen/gen.py from Operator.h
4
+
5
+ #include <tuple>
6
+ #include <vector>
7
+
8
+ // Forward declarations of any types needed in the operator signatures.
9
+ // We can't directly include these classes because it will cause circular include dependencies.
10
+ // This file is included by TensorBody.h, which defines the Tensor class.
11
+ #include <ATen/core/ATen_fwd.h>
12
+
13
+ namespace at {
14
+ namespace _ops {
15
+
16
+
17
+ struct TORCH_API hardswish_out {
18
+ using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
19
+ using ptr_schema = schema*;
20
+ // See Note [static constexpr char* members for windows NVCC]
21
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::hardswish")
22
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
23
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "hardswish.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
24
+ static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
25
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
26
+ };
27
+
28
+ struct TORCH_API hardswish {
29
+ using schema = at::Tensor (const at::Tensor &);
30
+ using ptr_schema = schema*;
31
+ // See Note [static constexpr char* members for windows NVCC]
32
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::hardswish")
33
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
34
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "hardswish(Tensor self) -> Tensor")
35
+ static at::Tensor call(const at::Tensor & self);
36
+ static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
37
+ };
38
+
39
+ struct TORCH_API hardswish_ {
40
+ using schema = at::Tensor & (at::Tensor &);
41
+ using ptr_schema = schema*;
42
+ // See Note [static constexpr char* members for windows NVCC]
43
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::hardswish_")
44
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
45
+ STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "hardswish_(Tensor(a!) self) -> Tensor(a!)")
46
+ static at::Tensor & call(at::Tensor & self);
47
+ static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);
48
+ };
49
+
50
+ }} // namespace at::_ops
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_put_compositeexplicitautograd_dispatch.h ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #pragma once
2
+ // @generated by torchgen/gen.py from DispatchKeyFunction.h
3
+
4
+ // NB: The implementing C++ file is RegisterDispatchKey.cpp
5
+
6
+ // The only #includes we need are for custom classes that have defaults in the C++ API
7
+ #include <c10/core/MemoryFormat.h>
8
+ #include <c10/core/Scalar.h>
9
+ #include <ATen/core/Reduction.h>
10
+
11
+ // Forward declarations of any types needed in the operator signatures.
12
+ // We can't directly include these classes because it will cause circular include dependencies.
13
+ // This file is included by TensorBody.h, which defines the Tensor class.
14
+ #include <ATen/core/ATen_fwd.h>
15
+
16
+ namespace at {
17
+
18
+ namespace compositeexplicitautograd {
19
+
20
+ TORCH_API at::Tensor index_put(const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false);
21
+ TORCH_API at::Tensor & index_put_out(at::Tensor & out, const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false);
22
+ TORCH_API at::Tensor & index_put_outf(const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate, at::Tensor & out);
23
+ TORCH_API at::Tensor & index_put_(at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices, const at::Tensor & values, bool accumulate=false);
24
+
25
+ } // namespace compositeexplicitautograd
26
+ } // namespace at