diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool3d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool3d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ec299d29b0884e4ba3b5db3e0fa9876fc851140
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool3d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _adaptive_avg_pool3d {
+  using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_adaptive_avg_pool3d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_adaptive_avg_pool3d(Tensor self, SymInt[3] output_size) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef output_size);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size);
+};
+
+struct TORCH_API _adaptive_avg_pool3d_out {
+  using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_adaptive_avg_pool3d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_adaptive_avg_pool3d.out(Tensor self, SymInt[3] output_size, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cummax_helper_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cummax_helper_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..64db1f76c08e21e67a590c1f3c93cc73a524a2ac
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cummax_helper_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API void _cummax_helper(const at::Tensor & self, at::Tensor & values, at::Tensor & indices, int64_t dim);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..e78a05dd3a2d7e0642ede3f7a1b863b9817bea3a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _fake_quantize_learnable_per_tensor_affine {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, int64_t, double);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_fake_quantize_learnable_per_tensor_affine")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_fake_quantize_learnable_per_tensor_affine(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor);
+};
+
+struct TORCH_API _fake_quantize_learnable_per_tensor_affine_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, int64_t, double, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_fake_quantize_learnable_per_tensor_affine")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_fake_quantize_learnable_per_tensor_affine.out(Tensor self, Tensor scale, Tensor zero_point, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_asin.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_asin.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b4a0adb20a0fa3a5daa757cc3d84df97ae95653
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_asin.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_foreach_asin_ops.h>
+
+namespace at {
+
+
+// aten::_foreach_asin(Tensor[] self) -> Tensor[]
+inline ::std::vector<at::Tensor> _foreach_asin(at::TensorList self) {
+    return at::_ops::_foreach_asin::call(self);
+}
+
+// aten::_foreach_asin_(Tensor(a!)[] self) -> ()
+inline void _foreach_asin_(at::TensorList self) {
+    return at::_ops::_foreach_asin_::call(self);
+}
+
+// aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_asin_out(at::TensorList out, at::TensorList self) {
+    return at::_ops::_foreach_asin_out::call(self, out);
+}
+// aten::_foreach_asin.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_asin_outf(at::TensorList self, at::TensorList out) {
+    return at::_ops::_foreach_asin_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..be2715382e60a5ef9e942bdf1932e524d7276299
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_frac_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::vector<at::Tensor> _foreach_frac(at::TensorList self);
+TORCH_API void _foreach_frac_(at::TensorList self);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log2_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log2_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea6cd9a690771fbf690e9e97cf28af4d2b296d61
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log2_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API void _foreach_log2_out(at::TensorList out, at::TensorList self);
+TORCH_API void _foreach_log2_outf(at::TensorList self, at::TensorList out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sqrt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sqrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..84400e1dd4cb00304502d931c31f67358ee2e0aa
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sqrt.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_foreach_sqrt_ops.h>
+
+namespace at {
+
+
+// aten::_foreach_sqrt(Tensor[] self) -> Tensor[]
+inline ::std::vector<at::Tensor> _foreach_sqrt(at::TensorList self) {
+    return at::_ops::_foreach_sqrt::call(self);
+}
+
+// aten::_foreach_sqrt_(Tensor(a!)[] self) -> ()
+inline void _foreach_sqrt_(at::TensorList self) {
+    return at::_ops::_foreach_sqrt_::call(self);
+}
+
+// aten::_foreach_sqrt.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_sqrt_out(at::TensorList out, at::TensorList self) {
+    return at::_ops::_foreach_sqrt_out::call(self, out);
+}
+// aten::_foreach_sqrt.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_sqrt_outf(at::TensorList self, at::TensorList out) {
+    return at::_ops::_foreach_sqrt_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbbdcbd019121541d352e98b4936e0099c9dad54
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_tensor_from_mask_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor _nested_tensor_from_mask(const at::Tensor & t, const at::Tensor & mask, bool mask_check=true);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_tensor_from_tensor_list_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_tensor_from_tensor_list_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a4b019d6267307307b9e95d5e1b2508384e6d23
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_tensor_from_tensor_list_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor _nested_tensor_from_tensor_list(at::TensorList list, c10::optional<at::ScalarType> dtype=c10::nullopt, c10::optional<at::Layout> layout=c10::nullopt, c10::optional<at::Device> device=c10::nullopt, c10::optional<bool> pin_memory=c10::nullopt);
+TORCH_API at::Tensor & _nested_tensor_from_tensor_list_out(at::Tensor & out, at::TensorList list, c10::optional<at::ScalarType> dtype=c10::nullopt, c10::optional<at::Layout> layout=c10::nullopt, c10::optional<at::Device> device=c10::nullopt, c10::optional<bool> pin_memory=c10::nullopt);
+TORCH_API at::Tensor & _nested_tensor_from_tensor_list_outf(at::TensorList list, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e821e42b1d1babf4976f8eb415c4e659987dbf5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_view_from_jagged_copy_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & _nested_view_from_jagged_copy_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const c10::optional<at::Tensor> & lengths={}, int64_t ragged_idx=1);
+TORCH_API at::Tensor & _nested_view_from_jagged_copy_outf(const at::Tensor & self, const at::Tensor & offsets, const at::Tensor & dummy, const c10::optional<at::Tensor> & lengths, int64_t ragged_idx, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_propagate_xla_data_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_propagate_xla_data_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a00740618fe20ea073354ac1ee433cd86810bd0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_propagate_xla_data_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _propagate_xla_data {
+  using schema = void (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_propagate_xla_data")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_propagate_xla_data(Tensor input, Tensor output) -> ()")
+  static void call(const at::Tensor & input, const at::Tensor & output);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & output);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..28eec721e84b0a1355ec708aee24e680a50b5797
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor> _scaled_dot_product_efficient_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_bias, bool compute_log_sumexp, double dropout_p=0.0, bool is_causal=false, c10::optional<double> scale=c10::nullopt);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims.h
new file mode 100644
index 0000000000000000000000000000000000000000..209eb3ccef96f290164785c736c4dde47c14618d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_with_dims.h
@@ -0,0 +1,43 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_sparse_coo_tensor_with_dims_ops.h>
+
+namespace at {
+
+
+// aten::_sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+inline at::Tensor _sparse_coo_tensor_with_dims(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, at::TensorOptions options) {
+    return at::_ops::_sparse_coo_tensor_with_dims::call(sparse_dim, dense_dim, size, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+// aten::_sparse_coo_tensor_with_dims(int sparse_dim, int dense_dim, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor
+inline at::Tensor _sparse_coo_tensor_with_dims(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::_sparse_coo_tensor_with_dims::call(sparse_dim, dense_dim, size, dtype, layout, device, pin_memory);
+}
+
+// aten::_sparse_coo_tensor_with_dims.out(int sparse_dim, int dense_dim, int[] size, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _sparse_coo_tensor_with_dims_out(at::Tensor & out, int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size) {
+    return at::_ops::_sparse_coo_tensor_with_dims_out::call(sparse_dim, dense_dim, size, out);
+}
+// aten::_sparse_coo_tensor_with_dims.out(int sparse_dim, int dense_dim, int[] size, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _sparse_coo_tensor_with_dims_outf(int64_t sparse_dim, int64_t dense_dim, at::IntArrayRef size, at::Tensor & out) {
+    return at::_ops::_sparse_coo_tensor_with_dims_out::call(sparse_dim, dense_dim, size, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..594af3b59ccfb046cafa0bed42972d3998c1c6d2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sparse_matmul_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & _sparse_sparse_matmul_out(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor sparse_sparse_matmul_cpu(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor sparse_sparse_matmul_cuda(const at::Tensor & self, const at::Tensor & other);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_differentiable_gru_cell_backward_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_differentiable_gru_cell_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..55be257872606a14eb4dfa16639dce066b3e6aac
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_differentiable_gru_cell_backward_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _thnn_differentiable_gru_cell_backward(const at::Tensor & grad_hy, const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const c10::optional<at::Tensor> & input_bias, const c10::optional<at::Tensor> & hidden_bias);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_semi_structured_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_semi_structured_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..a40b83487868746413074479be59a95ab4547014
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_semi_structured_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> _to_sparse_semi_structured(const at::Tensor & dense);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa.h
new file mode 100644
index 0000000000000000000000000000000000000000..239adc042e957e59a236d3c02daf358381f64c0e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa.h
@@ -0,0 +1,113 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_upsample_bilinear2d_aa_ops.h>
+
+namespace at {
+
+
+// aten::_upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+inline at::Tensor _upsample_bilinear2d_aa(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_bilinear2d_aa_vec::call(input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, align_corners, scale_factors);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor _upsample_bilinear2d_aa(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_bilinear2d_aa_vec::call(input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, align_corners, scale_factors);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa.vec(Tensor input, SymInt[]? output_size, bool align_corners, float[]? scale_factors) -> Tensor
+inline at::Tensor _upsample_bilinear2d_aa_symint(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_bilinear2d_aa_vec::call(input, output_size, align_corners, scale_factors);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor _upsample_bilinear2d_aa(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_bilinear2d_aa_vec::call(input, output_size, align_corners, scale_factors);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_bilinear2d_aa_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_out::call(self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & _upsample_bilinear2d_aa_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_out::call(self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_bilinear2d_aa_outf(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_bilinear2d_aa_out::call(self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & _upsample_bilinear2d_aa_outf(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_bilinear2d_aa_out::call(self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w, out);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_bilinear2d_aa_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_out::call(self, output_size, align_corners, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & _upsample_bilinear2d_aa_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_out::call(self, output_size, align_corners, scales_h, scales_w, out);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa.out(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_bilinear2d_aa_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_bilinear2d_aa_out::call(self, output_size, align_corners, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & _upsample_bilinear2d_aa_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_bilinear2d_aa_out::call(self, output_size, align_corners, scales_h, scales_w, out);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+inline at::Tensor _upsample_bilinear2d_aa(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa::call(self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor _upsample_bilinear2d_aa(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa::call(self, c10::fromIntArrayRefSlow(output_size), align_corners, scales_h, scales_w);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+inline at::Tensor _upsample_bilinear2d_aa_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa::call(self, output_size, align_corners, scales_h, scales_w);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor _upsample_bilinear2d_aa(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa::call(self, output_size, align_corners, scales_h, scales_w);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addr.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addr.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d28c4de3f7fb3242f947d13773ff17baef9ad23
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addr.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/addr_ops.h>
+
+namespace at {
+
+
+// aten::addr(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+inline at::Tensor addr(const at::Tensor & self, const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+    return at::_ops::addr::call(self, vec1, vec2, beta, alpha);
+}
+
+// aten::addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & addr_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+    return at::_ops::addr_out::call(self, vec1, vec2, beta, alpha, out);
+}
+// aten::addr.out(Tensor self, Tensor vec1, Tensor vec2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & addr_outf(const at::Tensor & self, const at::Tensor & vec1, const at::Tensor & vec2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+    return at::_ops::addr_out::call(self, vec1, vec2, beta, alpha, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d1e71888d22493972849fedf700d9fbdae737cd3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor all(const at::Tensor & self, at::Dimname dim, bool keepdim=false);
+TORCH_API at::Tensor & all_out(at::Tensor & out, const at::Tensor & self, at::Dimname dim, bool keepdim=false);
+TORCH_API at::Tensor & all_outf(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atanh_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atanh_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..401c0f867307016cbcc059646523bfb116459357
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atanh_cuda_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor atanh(const at::Tensor & self);
+TORCH_API at::Tensor & atanh_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & atanh_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & atanh_(at::Tensor & self);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bartlett_window_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bartlett_window_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..05f8e3fb21f21825abad1674ea2aebef79db4c43
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bartlett_window_ops.h
@@ -0,0 +1,61 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API bartlett_window {
+  using schema = at::Tensor (int64_t, c10::optional<at::ScalarType>, c10::optional<at::Layout>, c10::optional<at::Device>, c10::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bartlett_window")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor")
+  static at::Tensor call(int64_t window_length, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, int64_t window_length, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+};
+
+struct TORCH_API bartlett_window_periodic {
+  using schema = at::Tensor (int64_t, bool, c10::optional<at::ScalarType>, c10::optional<at::Layout>, c10::optional<at::Device>, c10::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bartlett_window")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "periodic")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bartlett_window.periodic(int window_length, bool periodic, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor")
+  static at::Tensor call(int64_t window_length, bool periodic, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+};
+
+struct TORCH_API bartlett_window_out {
+  using schema = at::Tensor & (int64_t, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bartlett_window")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bartlett_window.out(int window_length, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(int64_t window_length, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, int64_t window_length, at::Tensor & out);
+};
+
+struct TORCH_API bartlett_window_periodic_out {
+  using schema = at::Tensor & (int64_t, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bartlett_window")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "periodic_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bartlett_window.periodic_out(int window_length, bool periodic, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(int64_t window_length, bool periodic, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, int64_t window_length, bool periodic, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_gather_stats_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_gather_stats_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaddf3f88a2d9f6346a7f6d3229bf59d4533c437
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_gather_stats_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API batch_norm_gather_stats {
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, double, double, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::batch_norm_gather_stats")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "batch_norm_gather_stats(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count) -> (Tensor, Tensor)")
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, int64_t count);
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, int64_t count);
+};
+
+struct TORCH_API batch_norm_gather_stats_out {
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &> (const at::Tensor &, const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, double, double, int64_t, at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::batch_norm_gather_stats")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "batch_norm_gather_stats.out(Tensor input, Tensor mean, Tensor invstd, Tensor? running_mean, Tensor? running_var, float momentum, float eps, int count, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))")
+  static ::std::tuple<at::Tensor &,at::Tensor &> call(const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, int64_t count, at::Tensor & out0, at::Tensor & out1);
+  static ::std::tuple<at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, double momentum, double eps, int64_t count, at::Tensor & out0, at::Tensor & out1);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_and_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_and_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..af66882c390b5aed1ad14bff4b0cce3692b18de0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_and_ops.h
@@ -0,0 +1,105 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API bitwise_and_Tensor_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bitwise_and")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bitwise_and.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+};
+
+struct TORCH_API bitwise_and_Scalar_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bitwise_and")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bitwise_and.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+};
+
+struct TORCH_API bitwise_and_Scalar {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bitwise_and")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bitwise_and.Scalar(Tensor self, Scalar other) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other);
+};
+
+struct TORCH_API bitwise_and_Scalar_Tensor {
+  using schema = at::Tensor (const at::Scalar &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bitwise_and")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar_Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bitwise_and.Scalar_Tensor(Scalar self, Tensor other) -> Tensor")
+  static at::Tensor call(const at::Scalar & self, const at::Tensor & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other);
+};
+
+struct TORCH_API bitwise_and_Tensor {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bitwise_and")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bitwise_and.Tensor(Tensor self, Tensor other) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
+};
+
+struct TORCH_API bitwise_and__Scalar {
+  using schema = at::Tensor & (at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bitwise_and_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bitwise_and_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Scalar & other);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other);
+};
+
+struct TORCH_API bitwise_and__Tensor {
+  using schema = at::Tensor & (at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bitwise_and_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bitwise_and_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Tensor & other);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other);
+};
+
+struct TORCH_API bitwise_and_Scalar_Tensor_out {
+  using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bitwise_and")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar_Tensor_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bitwise_and.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Scalar & self, const at::Tensor & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & other, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_left_shift_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_left_shift_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..5733f2659d9b6989746203c9dbf372a53f02c823
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_left_shift_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_bitwise_left_shift_Tensor : public TensorIteratorBase {
+    
+    
+    void meta(const at::Tensor & self, const at::Tensor & other);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bucketize_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bucketize_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3be8c9b37e4670f3bab27a318a70bf52536a0e9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bucketize_ops.h
@@ -0,0 +1,61 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API bucketize_Tensor {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, bool, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bucketize")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bucketize.Tensor(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & boundaries, bool out_int32, bool right);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & boundaries, bool out_int32, bool right);
+};
+
+struct TORCH_API bucketize_Tensor_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, bool, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bucketize")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bucketize.Tensor_out(Tensor self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & boundaries, bool out_int32, bool right, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & boundaries, bool out_int32, bool right, at::Tensor & out);
+};
+
+struct TORCH_API bucketize_Scalar {
+  using schema = at::Tensor (const at::Scalar &, const at::Tensor &, bool, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bucketize")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bucketize.Scalar(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False) -> Tensor")
+  static at::Tensor call(const at::Scalar & self, const at::Tensor & boundaries, bool out_int32, bool right);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & boundaries, bool out_int32, bool right);
+};
+
+struct TORCH_API bucketize_Scalar_out {
+  using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, bool, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::bucketize")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "bucketize.Scalar_out(Scalar self, Tensor boundaries, *, bool out_int32=False, bool right=False, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Scalar & self, const at::Tensor & boundaries, bool out_int32, bool right, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & self, const at::Tensor & boundaries, bool out_int32, bool right, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cdist_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cdist_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ed688fee794747e93e232b5c32d59d20c9acf20
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cdist_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API cdist {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, double, c10::optional<int64_t>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::cdist")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & x1, const at::Tensor & x2, double p, c10::optional<int64_t> compute_mode);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x1, const at::Tensor & x2, double p, c10::optional<int64_t> compute_mode);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/col_indices_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/col_indices_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..536cf3af66c2d33d5bec95449491fe75fec28f52
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/col_indices_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API col_indices {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::col_indices")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "col_indices(Tensor(a) self) -> Tensor(a)")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag.h
new file mode 100644
index 0000000000000000000000000000000000000000..743e10ccdbddbf0e9de78265e31672d8d4dcc5e0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/diag_ops.h>
+
+namespace at {
+
+
+// aten::diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & diag_out(at::Tensor & out, const at::Tensor & self, int64_t diagonal=0) {
+    return at::_ops::diag_out::call(self, diagonal, out);
+}
+// aten::diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & diag_outf(const at::Tensor & self, int64_t diagonal, at::Tensor & out) {
+    return at::_ops::diag_out::call(self, diagonal, out);
+}
+
+// aten::diag(Tensor self, int diagonal=0) -> Tensor
+inline at::Tensor diag(const at::Tensor & self, int64_t diagonal=0) {
+    return at::_ops::diag::call(self, diagonal);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagonal_copy_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagonal_copy_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..46489889de9e731e487078f8ea0742bc58d12a24
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagonal_copy_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & diagonal_copy_out(at::Tensor & out, const at::Tensor & self, int64_t offset=0, int64_t dim1=0, int64_t dim2=1);
+TORCH_API at::Tensor & diagonal_copy_outf(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..7614c28971d57ad6ef15181805dbe18eec45e09b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_backward_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API elu_backward_grad_input {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, const at::Scalar &, const at::Scalar &, bool, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::elu_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "grad_input")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "elu_backward.grad_input(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result, *, Tensor(a!) grad_input) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result, at::Tensor & grad_input);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result, at::Tensor & grad_input);
+};
+
+struct TORCH_API elu_backward {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &, const at::Scalar &, const at::Scalar &, bool, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::elu_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor")
+  static at::Tensor call(const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, bool is_result, const at::Tensor & self_or_result);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/eye_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/eye_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..38f15587a8ef70fb221392d51d220192b0ed50f6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/eye_meta_dispatch.h
@@ -0,0 +1,30 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor & eye_out(at::Tensor & out, int64_t n);
+TORCH_API at::Tensor & eye_outf(int64_t n, at::Tensor & out);
+TORCH_API at::Tensor & eye_symint_out(at::Tensor & out, c10::SymInt n);
+TORCH_API at::Tensor & eye_symint_outf(c10::SymInt n, at::Tensor & out);
+TORCH_API at::Tensor & eye_out(at::Tensor & out, int64_t n, int64_t m);
+TORCH_API at::Tensor & eye_outf(int64_t n, int64_t m, at::Tensor & out);
+TORCH_API at::Tensor & eye_symint_out(at::Tensor & out, c10::SymInt n, c10::SymInt m);
+TORCH_API at::Tensor & eye_symint_outf(c10::SymInt n, c10::SymInt m, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft2_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft2_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..44d85e18e43149bda00a588ee76da97e11734f94
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft2_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API fft_fft2 {
+  using schema = at::Tensor (const at::Tensor &, at::OptionalSymIntArrayRef, at::IntArrayRef, c10::optional<c10::string_view>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_fft2")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_fft2(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm);
+};
+
+struct TORCH_API fft_fft2_out {
+  using schema = at::Tensor & (const at::Tensor &, at::OptionalSymIntArrayRef, at::IntArrayRef, c10::optional<c10::string_view>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_fft2")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_fft2.out(Tensor self, SymInt[1]? s=None, int[1] dim=[-2,-1], str? norm=None, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/igammac_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/igammac_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa16a552675b499c3c2295b09f8f085a807e4803
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/igammac_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_igammac : public TensorIteratorBase {
+    
+    
+    void meta(const at::Tensor & self, const at::Tensor & other);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_coalesced_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_coalesced_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..4698de0b1d4da3a745a6a1bfc1d11d93da8b7a42
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_coalesced_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API bool is_coalesced_default(const at::Tensor & self);
+TORCH_API bool is_coalesced_sparse(const at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_norm_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_norm_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3684c2dde5e902f7cfd6c09844e66c4eebcce5fd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_norm_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor linalg_norm(const at::Tensor & self, const c10::optional<at::Scalar> & ord=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & linalg_norm_out(at::Tensor & out, const at::Tensor & self, const c10::optional<at::Scalar> & ord=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & linalg_norm_outf(const at::Tensor & self, const c10::optional<at::Scalar> & ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+TORCH_API at::Tensor linalg_norm(const at::Tensor & self, c10::string_view ord, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & linalg_norm_out(at::Tensor & out, const at::Tensor & self, c10::string_view ord, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & linalg_norm_outf(const at::Tensor & self, c10::string_view ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lt_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lt_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..dce5c55d8f7891f3aeef763d7b5e78339a82b305
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lt_ops.h
@@ -0,0 +1,83 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API lt_Scalar_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::lt")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "lt.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+};
+
+struct TORCH_API lt_Scalar {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::lt")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "lt.Scalar(Tensor self, Scalar other) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & other);
+};
+
+struct TORCH_API lt_Tensor_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::lt")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "lt.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+};
+
+struct TORCH_API lt_Tensor {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::lt")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "lt.Tensor(Tensor self, Tensor other) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
+};
+
+struct TORCH_API lt__Scalar {
+  using schema = at::Tensor & (at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::lt_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "lt_.Scalar(Tensor(a!) self, Scalar other) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Scalar & other);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & other);
+};
+
+struct TORCH_API lt__Tensor {
+  using schema = at::Tensor & (at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::lt_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "lt_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Tensor & other);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & other);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9ea382ff78ccc5770288d1b949cc464cb3a3db6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API masked_scatter_ {
+  using schema = at::Tensor & (at::Tensor &, const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::masked_scatter_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "masked_scatter_(Tensor(a!) self, Tensor mask, Tensor source) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Tensor & mask, const at::Tensor & source);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & mask, const at::Tensor & source);
+};
+
+struct TORCH_API masked_scatter {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::masked_scatter")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source);
+};
+
+struct TORCH_API masked_scatter_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::masked_scatter")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c06dee8eb07a091054df33e776d9b1046efbf621
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> max(const at::Tensor & self, at::Dimname dim, bool keepdim=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> max_out(at::Tensor & max, at::Tensor & max_values, const at::Tensor & self, at::Dimname dim, bool keepdim=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> max_outf(const at::Tensor & self, at::Dimname dim, bool keepdim, at::Tensor & max, at::Tensor & max_values);
+TORCH_API at::Tensor max(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & max_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & max_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_unpool2d_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_unpool2d_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..181d07fe0b6c14146025bc53aba090d3862a4cbb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_unpool2d_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor max_unpool2d(const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size);
+TORCH_API at::Tensor max_unpool2d_symint(const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size);
+TORCH_API at::Tensor & max_unpool2d_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size);
+TORCH_API at::Tensor & max_unpool2d_outf(const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size, at::Tensor & out);
+TORCH_API at::Tensor & max_unpool2d_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size);
+TORCH_API at::Tensor & max_unpool2d_symint_outf(const at::Tensor & self, const at::Tensor & indices, c10::SymIntArrayRef output_size, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_unpool2d_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_unpool2d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..8255fc7c95260d71f74c4aed976127ce57609d7f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_unpool2d_native.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor max_unpooling2d_forward_cpu(const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size);
+TORCH_API at::Tensor & max_unpooling2d_forward_out_cpu(const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size, at::Tensor & out);
+TORCH_API at::Tensor max_unpooling2d_forward_cuda(const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size);
+TORCH_API at::Tensor & max_unpooling2d_forward_out_cuda(const at::Tensor & self, const at::Tensor & indices, at::IntArrayRef output_size, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..0eebfdbab373ff676f78cebd322af811c45de5f9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> miopen_batch_norm_out(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> miopen_batch_norm(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ed1d1f8a48c2d5aaa5603c8261c8342793d45f3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API mkldnn_adaptive_avg_pool2d {
+  using schema = at::Tensor (const at::Tensor &, at::IntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::mkldnn_adaptive_avg_pool2d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, at::IntArrayRef output_size);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size);
+};
+
+struct TORCH_API mkldnn_adaptive_avg_pool2d_out {
+  using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::mkldnn_adaptive_avg_pool2d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/msort_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/msort_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffa025ccad40952c4c798d6fcb6464274d986dcc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/msort_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor msort(const at::Tensor & self);
+TORCH_API at::Tensor & msort_out(const at::Tensor & self, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nan_to_num_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nan_to_num_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9d12033fc87e73a6f6417df56bdf446093fe812
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nan_to_num_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor nan_to_num(const at::Tensor & self, c10::optional<double> nan=c10::nullopt, c10::optional<double> posinf=c10::nullopt, c10::optional<double> neginf=c10::nullopt);
+TORCH_API at::Tensor & nan_to_num_(at::Tensor & self, c10::optional<double> nan=c10::nullopt, c10::optional<double> posinf=c10::nullopt, c10::optional<double> neginf=c10::nullopt);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/negative.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/negative.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ba0394665d5e06b3b4fc3e319eef7ddcacff4ab
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/negative.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/negative_ops.h>
+
+namespace at {
+
+
+// aten::negative(Tensor self) -> Tensor
+inline at::Tensor negative(const at::Tensor & self) {
+    return at::_ops::negative::call(self);
+}
+
+// aten::negative_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & negative_(at::Tensor & self) {
+    return at::_ops::negative_::call(self);
+}
+
+// aten::negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & negative_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::negative_out::call(self, out);
+}
+// aten::negative.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & negative_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::negative_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_empty_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_empty_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..448142a4dfae0486de6071cf83dbe3c071c7864c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_empty_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor new_empty_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={});
+TORCH_API at::Tensor & new_empty_out_symint(const at::Tensor & self, c10::SymIntArrayRef size, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc0af66d5d8e574fab688e13ade6e84c76ff2590
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_native.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/prod_meta.h>
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & prod_out(const at::Tensor & self, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+TORCH_API at::Tensor prod(const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt);
+struct TORCH_API structured_prod_out : public at::meta::structured_prod_dim_int {
+void impl(const at::Tensor & self, int64_t dim, bool keepdim, c10::optional<at::ScalarType> dtype, const at::Tensor & out);
+};
+TORCH_API at::Tensor prod(const at::Tensor & self, at::Dimname dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & prod_out(const at::Tensor & self, at::Dimname dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/put_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/put_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d52570b662c012d4b85b2ed292d7ba1abe029f41
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/put_meta_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor & put_(at::Tensor & self, const at::Tensor & index, const at::Tensor & source, bool accumulate=false);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_max_pool2d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_max_pool2d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..69089ab49000a80b5012e8a0f0153d50254019b9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_max_pool2d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API quantized_max_pool2d {
+  using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::quantized_max_pool2d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "quantized_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode);
+};
+
+struct TORCH_API quantized_max_pool2d_out {
+  using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::quantized_max_pool2d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "quantized_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/real_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/real_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..75b20319e2da6e700ec529d77e70bb6e57173c41
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/real_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor real(const at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e16a3d92e05fb9ab64fa472274a8fa86132560bb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor replication_pad1d_symint(const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out);
+TORCH_API at::Tensor & replication_pad1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & replication_pad1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/row_indices_copy_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/row_indices_copy_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f21aa45924283f0ecf042b226317b9830237f7a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/row_indices_copy_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor row_indices_copy(const at::Tensor & self);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rrelu_with_noise.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rrelu_with_noise.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6da561fce188ce2c5b0bddab716b17fe5f6fa01
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rrelu_with_noise.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/rrelu_with_noise_ops.h>
+
+namespace at {
+
+
+// aten::rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & rrelu_with_noise_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::rrelu_with_noise_out::call(self, noise, lower, upper, training, generator, out);
+}
+// aten::rrelu_with_noise.out(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & rrelu_with_noise_outf(const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::rrelu_with_noise_out::call(self, noise, lower, upper, training, generator, out);
+}
+
+// aten::rrelu_with_noise(Tensor self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor
+inline at::Tensor rrelu_with_noise(const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::rrelu_with_noise::call(self, noise, lower, upper, training, generator);
+}
+
+// aten::rrelu_with_noise_(Tensor(a!) self, Tensor noise, Scalar lower=0.125, Scalar upper=0.3333333333333333, bool training=False, Generator? generator=None) -> Tensor(a!)
+inline at::Tensor & rrelu_with_noise_(at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower=0.125, const at::Scalar & upper=0.3333333333333333, bool training=false, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::rrelu_with_noise_::call(self, noise, lower, upper, training, generator);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_reduce_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_reduce_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2add648340654ab66e33c3e5775ee3e6a080966b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_reduce_cuda_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor scatter_reduce(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true);
+TORCH_API at::Tensor & scatter_reduce_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true);
+TORCH_API at::Tensor & scatter_reduce_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self, at::Tensor & out);
+TORCH_API at::Tensor & scatter_reduce_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce, bool include_self=true);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..be51f278ca681417c5f9eee0acbd5b3401855105
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_backward.h
@@ -0,0 +1,91 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/select_backward_ops.h>
+
+namespace at {
+
+
+// aten::select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
+inline at::Tensor select_backward(const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index) {
+    return at::_ops::select_backward::call(grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor select_backward(const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index) {
+    return at::_ops::select_backward::call(grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index);
+  }
+}
+
+// aten::select_backward(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index) -> Tensor
+inline at::Tensor select_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
+    return at::_ops::select_backward::call(grad_output, input_sizes, dim, index);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor select_backward(const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
+    return at::_ops::select_backward::call(grad_output, input_sizes, dim, index);
+  }
+}
+
+// aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & select_backward_out(at::Tensor & out, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index) {
+    return at::_ops::select_backward_out::call(grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & select_backward_out(at::Tensor & out, const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index) {
+    return at::_ops::select_backward_out::call(grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index, out);
+  }
+}
+
+// aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & select_backward_outf(const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index, at::Tensor & out) {
+    return at::_ops::select_backward_out::call(grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & select_backward_outf(const at::Tensor & grad_output, at::IntArrayRef input_sizes, int64_t dim, int64_t index, at::Tensor & out) {
+    return at::_ops::select_backward_out::call(grad_output, c10::fromIntArrayRefSlow(input_sizes), dim, index, out);
+  }
+}
+
+// aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & select_backward_symint_out(at::Tensor & out, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
+    return at::_ops::select_backward_out::call(grad_output, input_sizes, dim, index, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & select_backward_out(at::Tensor & out, const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index) {
+    return at::_ops::select_backward_out::call(grad_output, input_sizes, dim, index, out);
+  }
+}
+
+// aten::select_backward.out(Tensor grad_output, SymInt[] input_sizes, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & select_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index, at::Tensor & out) {
+    return at::_ops::select_backward_out::call(grad_output, input_sizes, dim, index, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & select_backward_outf(const at::Tensor & grad_output, c10::SymIntArrayRef input_sizes, int64_t dim, c10::SymInt index, at::Tensor & out) {
+    return at::_ops::select_backward_out::call(grad_output, input_sizes, dim, index, out);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/set_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/set_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb7daf678380c9f3efa3a3c1e3b7b938fbbf2e2d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/set_cuda_dispatch.h
@@ -0,0 +1,27 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor & set_(at::Tensor & self, at::Storage source);
+TORCH_API at::Tensor & set_(at::Tensor & self, at::Storage source, int64_t storage_offset, at::IntArrayRef size, at::IntArrayRef stride={});
+TORCH_API at::Tensor & set__symint(at::Tensor & self, at::Storage source, c10::SymInt storage_offset, c10::SymIntArrayRef size, c10::SymIntArrayRef stride={});
+TORCH_API at::Tensor & set_(at::Tensor & self, const at::Tensor & source);
+TORCH_API at::Tensor & set_(at::Tensor & self);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/smm_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/smm_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f734516fc449a327784cc677c8e64f39ef13744
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/smm_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API smm {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::smm")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "smm(Tensor self, Tensor mat2) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & mat2);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mat2);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..76674116588fd5dd0d9488cbe83fb08199b479e6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor softplus(const at::Tensor & self, const at::Scalar & beta=1, const at::Scalar & threshold=20);
+TORCH_API at::Tensor & softplus_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & beta=1, const at::Scalar & threshold=20);
+TORCH_API at::Tensor & softplus_outf(const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softshrink_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softshrink_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..46752d2c17034617d6c46f3767e9a1b47d088d1a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softshrink_meta_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor softshrink(const at::Tensor & self, const at::Scalar & lambd=0.5);
+TORCH_API at::Tensor & softshrink_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & lambd=0.5);
+TORCH_API at::Tensor & softshrink_outf(const at::Tensor & self, const at::Scalar & lambd, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sparse_resize.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sparse_resize.h
new file mode 100644
index 0000000000000000000000000000000000000000..84b9eccd266df12f8b8f0f6dd14f94a40274ff40
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sparse_resize.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/sparse_resize_ops.h>
+
+namespace at {
+
+
+// aten::sparse_resize.out(Tensor self, int[] size, int sparse_dim, int dense_dim, *, Tensor(a!) out) -> Tensor(a!)
+inline const at::Tensor & sparse_resize_out(const at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) {
+    return at::_ops::sparse_resize_out::call(self, size, sparse_dim, dense_dim, out);
+}
+// aten::sparse_resize.out(Tensor self, int[] size, int sparse_dim, int dense_dim, *, Tensor(a!) out) -> Tensor(a!)
+inline const at::Tensor & sparse_resize_outf(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim, const at::Tensor & out) {
+    return at::_ops::sparse_resize_out::call(self, size, sparse_dim, dense_dim, out);
+}
+
+// aten::sparse_resize(Tensor self, int[] size, int sparse_dim, int dense_dim) -> Tensor
+inline at::Tensor sparse_resize(const at::Tensor & self, at::IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) {
+    return at::_ops::sparse_resize::call(self, size, sparse_dim, dense_dim);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_bessel_y0_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_bessel_y0_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..70f7758ce1882bc3cf8a46f328c0b796293dd086
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_bessel_y0_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor special_bessel_y0(const at::Tensor & self);
+TORCH_API at::Tensor & special_bessel_y0_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_bessel_y0_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_bessel_y0_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_bessel_y0_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..55e11d5a9bdb8706d75c9e9f5d07ec913645ef3b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_bessel_y0_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API special_bessel_y0 {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_bessel_y0")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_bessel_y0(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API special_bessel_y0_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_bessel_y0")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..02389dcff33a02635f2db0ca97214b801c0e35d4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_meta_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor special_chebyshev_polynomial_v(const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_chebyshev_polynomial_v_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_chebyshev_polynomial_v_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..407ac798b58b395bc2c407c61b2bb34d666a843a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_native.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/special_chebyshev_polynomial_v_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_special_chebyshev_polynomial_v_out : public at::meta::structured_special_chebyshev_polynomial_v {
+void impl(const at::Tensor & x, const at::Tensor & n, const at::Tensor & out);
+};
+TORCH_API at::Tensor special_chebyshev_polynomial_v(const at::Scalar & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_chebyshev_polynomial_v_out(const at::Scalar & x, const at::Tensor & n, at::Tensor & out);
+TORCH_API at::Tensor special_chebyshev_polynomial_v(const at::Tensor & x, const at::Scalar & n);
+TORCH_API at::Tensor & special_chebyshev_polynomial_v_out(const at::Tensor & x, const at::Scalar & n, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_he_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_he_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..34814ebc4a250a89cd7a3798c94e5891cf3d5c6d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_he_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor special_hermite_polynomial_he(const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_hermite_polynomial_he_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_hermite_polynomial_he_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_modified_bessel_i1_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_modified_bessel_i1_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..178225d062c9cde23f22d4c53cbe47d889924228
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_modified_bessel_i1_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor special_modified_bessel_i1(const at::Tensor & self);
+TORCH_API at::Tensor & special_modified_bessel_i1_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_modified_bessel_i1_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e40ed94c353a457eedcfcc5cc2b77ecc87cfbf19
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor special_scaled_modified_bessel_k1(const at::Tensor & x);
+TORCH_API at::Tensor & special_scaled_modified_bessel_k1_out(at::Tensor & out, const at::Tensor & x);
+TORCH_API at::Tensor & special_scaled_modified_bessel_k1_outf(const at::Tensor & x, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..1de0dad4f3a1508a194fdb35e2ea068e3850a7db
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_t_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_special_shifted_chebyshev_polynomial_t : public TensorIteratorBase {
+    
+    
+    void meta(const at::Tensor & x, const at::Tensor & n);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_copy_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_copy_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c5b8b5fa81b9aeddcc134e21adbe11fa11d2f8e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_copy_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API ::std::vector<at::Tensor> split_copy(const at::Tensor & self, int64_t split_size, int64_t dim=0);
+TORCH_API ::std::vector<at::Tensor> split_copy_symint(const at::Tensor & self, c10::SymInt split_size, int64_t dim=0);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tanh_backward_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tanh_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbd0e4502aa39ac5c9f4c5ae6658c01610973f2e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tanh_backward_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/tanh_backward_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_tanh_backward_out : public at::meta::structured_tanh_backward {
+void impl(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & grad_input);
+};
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..1eea477a5ca8ca84dba9c1fac836444c34a42250
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor unfold(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest1d_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest1d_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..d0479060dc334daff669f4995a17275d4a4b05da
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest1d_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_upsample_nearest1d : public at::impl::MetaBase {
+    
+    
+    void meta(const at::Tensor & self, at::ArrayRef<int64_t> output_size, c10::optional<double> scales);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest2d_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest2d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..3b35319b6b3a9eed962f1b5adc3e1a6845a18f17
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_nearest2d_native.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/upsample_nearest2d_meta.h>
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor upsample_nearest2d(const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors);
+struct TORCH_API structured_upsample_nearest2d_out_cpu : public at::meta::structured_upsample_nearest2d {
+void impl(const at::Tensor & self, at::ArrayRef<int64_t> output_size, c10::optional<double> scales_h, c10::optional<double> scales_w, const at::Tensor & out);
+};
+struct TORCH_API structured_upsample_nearest2d_out_cuda : public at::meta::structured_upsample_nearest2d {
+void impl(const at::Tensor & self, at::ArrayRef<int64_t> output_size, c10::optional<double> scales_h, c10::optional<double> scales_w, const at::Tensor & out);
+};
+TORCH_API at::Tensor upsample_nearest2d_quantized_cpu(const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..dae7711ba7a43506d4db9fec94a932baf302dd6f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_upsample_trilinear3d : public at::impl::MetaBase {
+    
+    
+    void meta(const at::Tensor & self, at::ArrayRef<int64_t> output_size, bool align_corners, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a03452aa9fd2a975250c202e431fe0f21012f286
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_meta_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor view(const at::Tensor & self, at::IntArrayRef size);
+TORCH_API at::Tensor view_symint(const at::Tensor & self, c10::SymIntArrayRef size);
+
+} // namespace meta
+} // namespace at