diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..dafda977b01f8790bb712603ce7af3259d1f1573
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor _adaptive_avg_pool2d(const at::Tensor & self, at::IntArrayRef output_size);
+TORCH_API at::Tensor _adaptive_avg_pool2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_amp_foreach_non_finite_check_and_unscale_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_amp_foreach_non_finite_check_and_unscale_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ecaf86eabb9f71f018ff84639425431ad905566e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_amp_foreach_non_finite_check_and_unscale_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API void _amp_foreach_non_finite_check_and_unscale_(at::TensorList self, at::Tensor & found_inf, const at::Tensor & inv_scale);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..6219b9550ccd485533469469ff4524bd6cd45f7e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & _embedding_bag_dense_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+TORCH_API at::Tensor & _embedding_bag_dense_backward_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out);
+TORCH_API at::Tensor & _embedding_bag_dense_backward_symint_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+TORCH_API at::Tensor & _embedding_bag_dense_backward_symint_outf(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fill_mem_eff_dropout_mask_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fill_mem_eff_dropout_mask_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..065bbe0fdbfab8ee4f648cc0826dc2f32eaaee16
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fill_mem_eff_dropout_mask_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _fill_mem_eff_dropout_mask_ {
+  using schema = at::Tensor & (at::Tensor &, double, int64_t, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_fill_mem_eff_dropout_mask_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, double dropout_p, int64_t seed, int64_t offset);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double dropout_p, int64_t seed, int64_t offset);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_abs_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_abs_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc68fd6cb6d35ddb385db8aa80879d45dddc5b28
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_abs_cpu_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API ::std::vector<at::Tensor> _foreach_abs(at::TensorList self);
+TORCH_API void _foreach_abs_(at::TensorList self);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log1p_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log1p_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e159812b0ecdee5cbbfe7ee8b75555d67468ff72
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log1p_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API void _foreach_log1p_out(at::TensorList out, at::TensorList self);
+TORCH_API void _foreach_log1p_outf(at::TensorList self, at::TensorList out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..de31a883889cfc20c99365a52d60e2850c2ef73c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _foreach_log {
+  using schema = ::std::vector<at::Tensor> (at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_log")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_log(Tensor[] self) -> Tensor[]")
+  static ::std::vector<at::Tensor> call(at::TensorList self);
+  static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self);
+};
+
+struct TORCH_API _foreach_log_ {
+  using schema = void (at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_log_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_log_(Tensor(a!)[] self) -> ()")
+  static void call(at::TensorList self);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self);
+};
+
+struct TORCH_API _foreach_log_out {
+  using schema = void (at::TensorList, at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_log")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_log.out(Tensor[] self, *, Tensor(a!)[] out) -> ()")
+  static void call(at::TensorList self, at::TensorList out);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_functional_assert_scalar_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_functional_assert_scalar_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b400fb1d091342135f9e1f5a2d97a3d4abfa5c8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_functional_assert_scalar_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor _functional_assert_scalar(const at::Scalar & self, c10::string_view assert_msg, const at::Tensor & dep_token);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_grid_sampler_2d_cpu_fallback_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_grid_sampler_2d_cpu_fallback_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..aea23dc1929a5ea91a2b42b95225def4d47a6bb0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_grid_sampler_2d_cpu_fallback_backward_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _grid_sampler_2d_cpu_fallback_backward {
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, int64_t, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_grid_sampler_2d_cpu_fallback_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_grid_sampler_2d_cpu_fallback_backward(Tensor grad_output, Tensor input, Tensor grid, int interpolation_mode, int padding_mode, bool align_corners) -> (Tensor, Tensor)")
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & grid, int64_t interpolation_mode, int64_t padding_mode, bool align_corners);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..84116ea8fa2765810d6c28f8bac75d0edc391501
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_log_softmax_backward_data_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured__log_softmax_backward_data : public at::impl::MetaBase {
+    
+    
+    void meta(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, at::ScalarType input_dtype);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d03963449be645459df4ef6ce709c7b97aa6739
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_per_channel_quantized_tensor_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _make_per_channel_quantized_tensor {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_make_per_channel_quantized_tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_make_per_channel_quantized_tensor(Tensor self, Tensor scale, Tensor zero_point, int axis) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis);
+};
+
+struct TORCH_API _make_per_channel_quantized_tensor_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_make_per_channel_quantized_tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_make_per_channel_quantized_tensor.out(Tensor self, Tensor scale, Tensor zero_point, int axis, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_mkldnn_transpose_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_mkldnn_transpose_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9469088ab234157b1dc198c9e19363895a124312
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_mkldnn_transpose_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & _mkldnn_transpose_out(at::Tensor & out, const at::Tensor & self, int64_t dim0, int64_t dim1);
+TORCH_API at::Tensor & _mkldnn_transpose_outf(const at::Tensor & self, int64_t dim0, int64_t dim1, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ed42a05a144652547de78357edaee1224ca1229
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_native_batch_norm_legit_native.h
@@ -0,0 +1,31 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _native_batch_norm_legit_functional(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const at::Tensor & running_mean, const at::Tensor & running_var, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _batch_norm_legit_cpu_out(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _batch_norm_legit_cpu(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _batch_norm_legit_cuda_out(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _batch_norm_legit_cuda(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _mkldnn_batch_norm_legit(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, at::Tensor & running_mean, at::Tensor & running_var, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _batch_norm_legit_no_stats_cpu(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _batch_norm_legit_no_stats_cpu_out(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _batch_norm_legit_no_stats_cuda(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, bool training, double momentum, double eps);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _batch_norm_legit_no_stats_cuda_out(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, bool training, double momentum, double eps, at::Tensor & out, at::Tensor & save_mean, at::Tensor & save_invstd);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _mkldnn_batch_norm_legit_no_stats(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, bool training, double momentum, double eps);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..9253c827b2076e5ccc78cba6c45b235e89d0672b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_new_zeros_with_same_feature_meta_ops.h>
+
+namespace at {
+
+
+// aten::_new_zeros_with_same_feature_meta(Tensor self, Tensor other, *, int self_num_batch_dims=0) -> Tensor
+inline at::Tensor _new_zeros_with_same_feature_meta(const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims=0) {
+    return at::_ops::_new_zeros_with_same_feature_meta::call(self, other, self_num_batch_dims);
+}
+
+// aten::_new_zeros_with_same_feature_meta.out(Tensor self, Tensor other, *, int self_num_batch_dims=0, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _new_zeros_with_same_feature_meta_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims=0) {
+    return at::_ops::_new_zeros_with_same_feature_meta_out::call(self, other, self_num_batch_dims, out);
+}
+// aten::_new_zeros_with_same_feature_meta.out(Tensor self, Tensor other, *, int self_num_batch_dims=0, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _new_zeros_with_same_feature_meta_outf(const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims, at::Tensor & out) {
+    return at::_ops::_new_zeros_with_same_feature_meta_out::call(self, other, self_num_batch_dims, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_attention_math.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_attention_math.h
new file mode 100644
index 0000000000000000000000000000000000000000..ea245d0ca8fc22dd6c53dcf3db9955bfcb677517
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_attention_math.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_scaled_dot_product_attention_math_ops.h>
+
+namespace at {
+
+
+// aten::_scaled_dot_product_attention_math(Tensor query, Tensor key, Tensor value, Tensor? attn_mask=None, float dropout_p=0.0, bool is_causal=False, Tensor? dropout_mask=None, *, float? scale=None) -> (Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor> _scaled_dot_product_attention_math(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & attn_mask={}, double dropout_p=0.0, bool is_causal=false, const c10::optional<at::Tensor> & dropout_mask={}, c10::optional<double> scale=c10::nullopt) {
+    return at::_ops::_scaled_dot_product_attention_math::call(query, key, value, attn_mask, dropout_p, is_causal, dropout_mask, scale);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c7757eb81d16a5fb1e8d3b4fa3ee1b596ebd9d2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_csc_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor _to_sparse_csc(const at::Tensor & self, c10::optional<int64_t> dense_dim=c10::nullopt);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h
new file mode 100644
index 0000000000000000000000000000000000000000..1998fd099df5b5aa248d08d82f320843ab3daa5b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_use_cudnn_ctc_loss.h
@@ -0,0 +1,35 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_use_cudnn_ctc_loss_ops.h>
+
+namespace at {
+
+
+// aten::_use_cudnn_ctc_loss(Tensor log_probs, Tensor targets, int[] input_lengths, int[] target_lengths, int blank) -> bool
+inline bool _use_cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, at::IntArrayRef input_lengths, at::IntArrayRef target_lengths, int64_t blank) {
+    return at::_ops::_use_cudnn_ctc_loss::call(log_probs, targets, input_lengths, target_lengths, blank);
+}
+
+// aten::_use_cudnn_ctc_loss.Tensor(Tensor log_probs, Tensor targets, Tensor input_lengths, Tensor target_lengths, int blank) -> bool
+inline bool _use_cudnn_ctc_loss(const at::Tensor & log_probs, const at::Tensor & targets, const at::Tensor & input_lengths, const at::Tensor & target_lengths, int64_t blank) {
+    return at::_ops::_use_cudnn_ctc_loss_Tensor::call(log_probs, targets, input_lengths, target_lengths, blank);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..66b8b82bfe77734dfd0ac224a01cd2fbdbd06717
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_compressed_sparse_indices_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API void _validate_compressed_sparse_indices_cpu(bool is_crow, const at::Tensor & compressed_idx, const at::Tensor & plain_idx, int64_t cdim, int64_t dim, int64_t nnz);
+TORCH_API void _validate_compressed_sparse_indices_cuda(bool is_crow, const at::Tensor & compressed_idx, const at::Tensor & plain_idx, int64_t cdim, int64_t dim, int64_t nnz);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..5374970a57e8f347fa712f2a0f2059189be08378
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _validate_sparse_bsc_tensor_args {
+  using schema = void (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::IntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_validate_sparse_bsc_tensor_args")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_validate_sparse_bsc_tensor_args(Tensor ccol_indices, Tensor row_indices, Tensor values, int[] size) -> ()")
+  static void call(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/abs_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/abs_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..00e5a7b1c264ce43bca5e01ecbe88a563d14c8e0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/abs_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor & abs_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & abs_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9eefa559e3eadca55dad3f974a90f2ee1ad4aba7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor adaptive_avg_pool3d(const at::Tensor & self, at::IntArrayRef output_size);
+TORCH_API at::Tensor adaptive_avg_pool3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d5ec49e102c5aad2f38169383f551aaf4ec0e8f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool2d_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> adaptive_max_pool2d(const at::Tensor & self, at::IntArrayRef output_size);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alpha_dropout_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alpha_dropout_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..e24b76c96b8ac3002be6bdc2204a1ec240f8adde
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alpha_dropout_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API alpha_dropout {
+  using schema = at::Tensor (const at::Tensor &, double, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::alpha_dropout")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "alpha_dropout(Tensor input, float p, bool train) -> Tensor")
+  static at::Tensor call(const at::Tensor & input, double p, bool train);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, bool train);
+};
+
+struct TORCH_API alpha_dropout_ {
+  using schema = at::Tensor & (at::Tensor &, double, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::alpha_dropout_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "alpha_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, double p, bool train);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, bool train);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/amax.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/amax.h
new file mode 100644
index 0000000000000000000000000000000000000000..193d377398c84884a9691bcb98520948b78b13fb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/amax.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/amax_ops.h>
+
+namespace at {
+
+
+// aten::amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
+inline at::Tensor amax(const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false) {
+    return at::_ops::amax::call(self, dim, keepdim);
+}
+
+// aten::amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & amax_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim={}, bool keepdim=false) {
+    return at::_ops::amax_out::call(self, dim, keepdim, out);
+}
+// aten::amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & amax_outf(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out) {
+    return at::_ops::amax_out::call(self, dim, keepdim, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf57f017c020e8183907662dd305e028dffb198a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_cpu_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor ceil(const at::Tensor & self);
+TORCH_API at::Tensor & ceil_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & ceil_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & ceil_(at::Tensor & self);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/complex_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/complex_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..85947adb6e2a6ae1395d78efcd786ad35e894123
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/complex_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API complex {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::complex")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "complex(Tensor real, Tensor imag) -> Tensor")
+  static at::Tensor call(const at::Tensor & real, const at::Tensor & imag);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & real, const at::Tensor & imag);
+};
+
+struct TORCH_API complex_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::complex")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "complex.out(Tensor real, Tensor imag, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & real, const at::Tensor & imag, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & real, const at::Tensor & imag, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/copy_sparse_to_sparse_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/copy_sparse_to_sparse_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca037317e17597b98d5beab0a3b3289a81de2a86
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/copy_sparse_to_sparse_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API copy_sparse_to_sparse_ {
+  using schema = at::Tensor & (at::Tensor &, const at::Tensor &, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::copy_sparse_to_sparse_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "copy_sparse_to_sparse_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Tensor & src, bool non_blocking);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Tensor & src, bool non_blocking);
+};
+
+struct TORCH_API copy_sparse_to_sparse_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::copy_sparse_to_sparse")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "copy_sparse_to_sparse.out(Tensor self, Tensor src, bool non_blocking=False, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & src, bool non_blocking, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, bool non_blocking, at::Tensor & out);
+};
+
+struct TORCH_API copy_sparse_to_sparse {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::copy_sparse_to_sparse")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "copy_sparse_to_sparse(Tensor self, Tensor src, bool non_blocking=False) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & src, bool non_blocking);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & src, bool non_blocking);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8423d3881dfbebb212c422f4e66484bd64d7e3e2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & cudnn_grid_sampler_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & grid);
+TORCH_API at::Tensor & cudnn_grid_sampler_outf(const at::Tensor & self, const at::Tensor & grid, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/detach_copy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/detach_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd1775787f9b9d1ea8b0961025d6878e46d72b10
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/detach_copy.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/detach_copy_ops.h>
+
+namespace at {
+
+
+// aten::detach_copy(Tensor self) -> Tensor
+inline at::Tensor detach_copy(const at::Tensor & self) {
+    return at::_ops::detach_copy::call(self);
+}
+
+// aten::detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & detach_copy_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::detach_copy_out::call(self, out);
+}
+// aten::detach_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & detach_copy_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::detach_copy_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expand_as.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expand_as.h
new file mode 100644
index 0000000000000000000000000000000000000000..173787aba901b457c5748693b770d61e28c239e9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expand_as.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/expand_as_ops.h>
+
+namespace at {
+
+
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/feature_dropout_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/feature_dropout_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..687d954554f7971f7e319c7840c9ea9ff1845db9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/feature_dropout_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API feature_dropout {
+  using schema = at::Tensor (const at::Tensor &, double, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::feature_dropout")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "feature_dropout(Tensor input, float p, bool train) -> Tensor")
+  static at::Tensor call(const at::Tensor & input, double p, bool train);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, double p, bool train);
+};
+
+struct TORCH_API feature_dropout_ {
+  using schema = at::Tensor & (at::Tensor &, double, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::feature_dropout_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "feature_dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, double p, bool train);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, double p, bool train);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfft_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfft_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9036a46cb1f80e39f741ad7e3160cecc4615935
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfft_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor fft_irfft(const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt);
+TORCH_API at::Tensor fft_irfft_symint(const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt);
+TORCH_API at::Tensor & fft_irfft_out(at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt);
+TORCH_API at::Tensor & fft_irfft_outf(const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out);
+TORCH_API at::Tensor & fft_irfft_symint_out(at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt);
+TORCH_API at::Tensor & fft_irfft_symint_outf(const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu.h
new file mode 100644
index 0000000000000000000000000000000000000000..490b8e1dac18790f0b6bff03389aea7b97668390
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/gelu_ops.h>
+
+namespace at {
+
+
+// aten::gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & gelu_out(at::Tensor & out, const at::Tensor & self, c10::string_view approximate="none") {
+    return at::_ops::gelu_out::call(self, approximate, out);
+}
+// aten::gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & gelu_outf(const at::Tensor & self, c10::string_view approximate, at::Tensor & out) {
+    return at::_ops::gelu_out::call(self, approximate, out);
+}
+
+// aten::gelu_(Tensor(a!) self, *, str approximate='none') -> Tensor(a!)
+inline at::Tensor & gelu_(at::Tensor & self, c10::string_view approximate="none") {
+    return at::_ops::gelu_::call(self, approximate);
+}
+
+// aten::gelu(Tensor self, *, str approximate='none') -> Tensor
+inline at::Tensor gelu(const at::Tensor & self, c10::string_view approximate="none") {
+    return at::_ops::gelu::call(self, approximate);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf3f261dcd2c9e65f308e42572ccfe7977b7dcdc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor gelu_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none");
+TORCH_API at::Tensor & gelu_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none");
+TORCH_API at::Tensor & gelu_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate, at::Tensor & grad_input);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_fill.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_fill.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2df0dbc43767cd06ec424b3f516dcfd691b3f6b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_fill.h
@@ -0,0 +1,63 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/index_fill_ops.h>
+
+namespace at {
+
+
+// aten::index_fill.int_Scalar(Tensor self, int dim, Tensor index, Scalar value) -> Tensor
+inline at::Tensor index_fill(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value) {
+    return at::_ops::index_fill_int_Scalar::call(self, dim, index, value);
+}
+
+// aten::index_fill.int_Tensor(Tensor self, int dim, Tensor index, Tensor value) -> Tensor
+inline at::Tensor index_fill(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & value) {
+    return at::_ops::index_fill_int_Tensor::call(self, dim, index, value);
+}
+
+// aten::index_fill.Dimname_Scalar(Tensor self, Dimname dim, Tensor index, Scalar value) -> Tensor
+inline at::Tensor index_fill(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Scalar & value) {
+    return at::_ops::index_fill_Dimname_Scalar::call(self, dim, index, value);
+}
+
+// aten::index_fill.Dimname_Tensor(Tensor self, Dimname dim, Tensor index, Tensor value) -> Tensor
+inline at::Tensor index_fill(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & value) {
+    return at::_ops::index_fill_Dimname_Tensor::call(self, dim, index, value);
+}
+
+// aten::index_fill.int_Scalar_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & index_fill_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value) {
+    return at::_ops::index_fill_int_Scalar_out::call(self, dim, index, value, out);
+}
+// aten::index_fill.int_Scalar_out(Tensor self, int dim, Tensor index, Scalar value, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & index_fill_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, at::Tensor & out) {
+    return at::_ops::index_fill_int_Scalar_out::call(self, dim, index, value, out);
+}
+
+// aten::index_fill.int_Tensor_out(Tensor self, int dim, Tensor index, Tensor value, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & index_fill_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & value) {
+    return at::_ops::index_fill_int_Tensor_out::call(self, dim, index, value, out);
+}
+// aten::index_fill.int_Tensor_out(Tensor self, int dim, Tensor index, Tensor value, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & index_fill_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & value, at::Tensor & out) {
+    return at::_ops::index_fill_int_Tensor_out::call(self, dim, index, value, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_conj_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_conj_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..69c9aeeea15aaf7bb175f84cdfe9e55443da7a4d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_conj_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API bool is_conj(const at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isinf_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isinf_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..573d0afbce5fcc21f8f8c4b985ab6ab4c55f657c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isinf_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor isinf(const at::Tensor & self);
+TORCH_API at::Tensor & isinf_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & isinf_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/leaky_relu_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/leaky_relu_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd7918ccbd17e4bb3da31aa151876c624007634f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/leaky_relu_native.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/leaky_relu_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_leaky_relu_out : public at::meta::structured_leaky_relu {
+void impl(const at::Tensor & self, const at::Scalar & negative_slope, const at::Tensor & out);
+};
+TORCH_API at::Tensor leaky_relu_quantized_cpu(const at::Tensor & self, const at::Scalar & negative_slope=0.01);
+TORCH_API at::Tensor & leaky_relu_out_quantized_cpu(const at::Tensor & self, const at::Scalar & negative_slope, at::Tensor & out);
+TORCH_API at::Tensor & leaky_relu_quantized_cpu_(at::Tensor & self, const at::Scalar & negative_slope=0.01);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lift_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lift_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..abd012fd391dc43e1847a657cc9b6f62f7a95512
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lift_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor lift(const at::Tensor & self);
+TORCH_API at::Tensor & lift_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & lift_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_matrix_power_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_matrix_power_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..89e6e55ae4bfffdf0c70c4766ff9da4b8ec8d683
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_matrix_power_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor linalg_matrix_power(const at::Tensor & self, int64_t n);
+TORCH_API at::Tensor & linalg_matrix_power_out(const at::Tensor & self, int64_t n, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_multi_dot_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_multi_dot_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe6e94966dbd632d502f819c4a2a5da2ca9050f2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_multi_dot_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor linalg_multi_dot(at::TensorList tensors);
+TORCH_API at::Tensor & linalg_multi_dot_out(at::Tensor & out, at::TensorList tensors);
+TORCH_API at::Tensor & linalg_multi_dot_outf(at::TensorList tensors, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_qr_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_qr_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..34cada11578a683d81127259c7f359dbcae59862
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_qr_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API linalg_qr {
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, c10::string_view);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::linalg_qr")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "linalg_qr(Tensor A, str mode='reduced') -> (Tensor Q, Tensor R)")
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & A, c10::string_view mode);
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::string_view mode);
+};
+
+struct TORCH_API linalg_qr_out {
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &> (const at::Tensor &, c10::string_view, at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::linalg_qr")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "linalg_qr.out(Tensor A, str mode='reduced', *, Tensor(a!) Q, Tensor(b!) R) -> (Tensor(a!) Q, Tensor(b!) R)")
+  static ::std::tuple<at::Tensor &,at::Tensor &> call(const at::Tensor & A, c10::string_view mode, at::Tensor & Q, at::Tensor & R);
+  static ::std::tuple<at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & A, c10::string_view mode, at::Tensor & Q, at::Tensor & R);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linspace_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linspace_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1165d7d92872c593482ad21e3e2ddbef3d13e7a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linspace_meta_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor & linspace_out(at::Tensor & out, const at::Scalar & start, const at::Scalar & end, int64_t steps);
+TORCH_API at::Tensor & linspace_outf(const at::Scalar & start, const at::Scalar & end, int64_t steps, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logcumsumexp.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logcumsumexp.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd91b11b1bac465c1704fa4821906a9bda963a1f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logcumsumexp.h
@@ -0,0 +1,53 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/logcumsumexp_ops.h>
+
+namespace at {
+
+
+// aten::logcumsumexp(Tensor self, int dim) -> Tensor
+inline at::Tensor logcumsumexp(const at::Tensor & self, int64_t dim) {
+    return at::_ops::logcumsumexp::call(self, dim);
+}
+
+// aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logcumsumexp_out(at::Tensor & out, const at::Tensor & self, int64_t dim) {
+    return at::_ops::logcumsumexp_out::call(self, dim, out);
+}
+// aten::logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logcumsumexp_outf(const at::Tensor & self, int64_t dim, at::Tensor & out) {
+    return at::_ops::logcumsumexp_out::call(self, dim, out);
+}
+
+// aten::logcumsumexp.dimname(Tensor self, Dimname dim) -> Tensor
+inline at::Tensor logcumsumexp(const at::Tensor & self, at::Dimname dim) {
+    return at::_ops::logcumsumexp_dimname::call(self, dim);
+}
+
+// aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logcumsumexp_out(at::Tensor & out, const at::Tensor & self, at::Dimname dim) {
+    return at::_ops::logcumsumexp_dimname_out::call(self, dim, out);
+}
+// aten::logcumsumexp.dimname_out(Tensor self, Dimname dim, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & logcumsumexp_outf(const at::Tensor & self, at::Dimname dim, at::Tensor & out) {
+    return at::_ops::logcumsumexp_dimname_out::call(self, dim, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logit_backward_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logit_backward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..eefe67d3f8dfad5d6b1b687666c22826ba40d594
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logit_backward_cpu_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor logit_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::optional<double> eps=c10::nullopt);
+TORCH_API at::Tensor & logit_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::optional<double> eps=c10::nullopt);
+TORCH_API at::Tensor & logit_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::optional<double> eps, at::Tensor & grad_input);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matmul_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matmul_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..366180b1c3d174e0507a34af698fb754a61bedab
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matmul_backward.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/matmul_backward_ops.h>
+
+namespace at {
+
+
+// aten::matmul_backward(Tensor grad, Tensor self, Tensor other, bool[2] mask) -> (Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor> matmul_backward(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
+    return at::_ops::matmul_backward::call(grad, self, other, mask);
+}
+
+// aten::matmul_backward.out(Tensor grad, Tensor self, Tensor other, bool[2] mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+inline ::std::tuple<at::Tensor &,at::Tensor &> matmul_backward_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask) {
+    return at::_ops::matmul_backward_out::call(grad, self, other, mask, out0, out1);
+}
+// aten::matmul_backward.out(Tensor grad, Tensor self, Tensor other, bool[2] mask, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+inline ::std::tuple<at::Tensor &,at::Tensor &> matmul_backward_outf(const at::Tensor & grad, const at::Tensor & self, const at::Tensor & other, ::std::array<bool,2> mask, at::Tensor & out0, at::Tensor & out1) {
+    return at::_ops::matmul_backward_out::call(grad, self, other, mask, out0, out1);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool3d_with_indices_backward_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool3d_with_indices_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..1350bf4ec06f18e2077a67c38cf3af99857de50e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool3d_with_indices_backward_native.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor max_pool3d_with_indices_backward_cpu(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices);
+TORCH_API at::Tensor & max_pool3d_with_indices_backward_out_cpu(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices, at::Tensor & grad_input);
+TORCH_API at::Tensor max_pool3d_with_indices_backward_cuda(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices);
+TORCH_API at::Tensor & max_pool3d_with_indices_backward_out_cuda(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices, at::Tensor & grad_input);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_depthwise_convolution_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_depthwise_convolution_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..2f9a3b0f11efb1df8d082a41e2562d2c2bd3df2b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_depthwise_convolution_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & miopen_depthwise_convolution_out_symint(const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, c10::SymIntArrayRef dilation, c10::SymInt groups, bool benchmark, bool deterministic, at::Tensor & out);
+TORCH_API at::Tensor miopen_depthwise_convolution(const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef padding, at::IntArrayRef stride, at::IntArrayRef dilation, int64_t groups, bool benchmark, bool deterministic);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a21a908348721951a13c892ccf46b7c2e11d58b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & mkldnn_adaptive_avg_pool2d_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self);
+TORCH_API at::Tensor & mkldnn_adaptive_avg_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..95ac45579de670b2a4fa978ec73a3f6c04e300b4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_max_pool2d.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/mkldnn_max_pool2d_ops.h>
+
+namespace at {
+
+
+// aten::mkldnn_max_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> Tensor
+inline at::Tensor mkldnn_max_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+    return at::_ops::mkldnn_max_pool2d::call(self, kernel_size, stride, padding, dilation, ceil_mode);
+}
+
+// aten::mkldnn_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & mkldnn_max_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+    return at::_ops::mkldnn_max_pool2d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out);
+}
+// aten::mkldnn_max_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & mkldnn_max_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out) {
+    return at::_ops::mkldnn_max_pool2d_out::call(self, kernel_size, stride, padding, dilation, ceil_mode, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/movedim_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/movedim_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..c45aa6d3e1ab081f0e62c0bc7bce566e330abad2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/movedim_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor movedim(const at::Tensor & self, at::IntArrayRef source, at::IntArrayRef destination);
+TORCH_API at::Tensor movedim(const at::Tensor & self, int64_t source, int64_t destination);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_group_norm_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_group_norm_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..99c3ef2414fb44710c214b1b5c72f00ee701fbf3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_group_norm_backward.h
@@ -0,0 +1,91 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/native_group_norm_backward_ops.h>
+
+namespace at {
+
+
+// aten::native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_group_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_group_norm_backward::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_group_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_group_norm_backward::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask);
+  }
+}
+
+// aten::native_group_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_group_norm_backward_symint(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_group_norm_backward::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_group_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_group_norm_backward::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask);
+  }
+}
+
+// aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_group_norm_backward_out::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_group_norm_backward_out::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+  }
+}
+
+// aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_outf(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_group_norm_backward_out::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_outf(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, int64_t N, int64_t C, int64_t HxW, int64_t group, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_group_norm_backward_out::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+  }
+}
+
+// aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_symint_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_group_norm_backward_out::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_group_norm_backward_out::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+  }
+}
+
+// aten::native_group_norm_backward.out(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, SymInt N, SymInt C, SymInt HxW, int group, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_symint_outf(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_group_norm_backward_out::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_group_norm_backward_outf(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, c10::SymInt N, c10::SymInt C, c10::SymInt HxW, int64_t group, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_group_norm_backward_out::call(grad_out, input, mean, rstd, weight, N, C, HxW, group, output_mask, out0, out1, out2);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c8722a8f4acab65399e3b8c56441858ab707f01
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> nll_loss_forward(const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> nll_loss_forward_symint(const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_out(at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_outf(const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, int64_t ignore_index, at::Tensor & output, at::Tensor & total_weight);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_symint_out(at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> nll_loss_forward_symint_outf(const at::Tensor & self, const at::Tensor & target, const c10::optional<at::Tensor> & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..873278408f909df0623739937d9a66bb7d1c1244
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/poisson_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor poisson(const at::Tensor & self, c10::optional<at::Generator> generator=c10::nullopt);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rand_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rand_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..35d3d537a32d03d6b22f44e19663d1d8dffbf2f6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rand_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,50 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor rand(at::IntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={});
+TORCH_API at::Tensor rand(at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+TORCH_API at::Tensor rand_symint(c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, at::TensorOptions options={});
+TORCH_API at::Tensor rand_symint(c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+TORCH_API at::Tensor & rand_out(at::Tensor & out, at::IntArrayRef size, c10::optional<at::DimnameList> names);
+TORCH_API at::Tensor & rand_outf(at::IntArrayRef size, c10::optional<at::DimnameList> names, at::Tensor & out);
+TORCH_API at::Tensor & rand_symint_out(at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::DimnameList> names);
+TORCH_API at::Tensor & rand_symint_outf(c10::SymIntArrayRef size, c10::optional<at::DimnameList> names, at::Tensor & out);
+TORCH_API at::Tensor rand(at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::TensorOptions options={});
+TORCH_API at::Tensor rand(at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+TORCH_API at::Tensor rand_symint(c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::TensorOptions options={});
+TORCH_API at::Tensor rand_symint(c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+TORCH_API at::Tensor & rand_out(at::Tensor & out, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names);
+TORCH_API at::Tensor & rand_outf(at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::Tensor & out);
+TORCH_API at::Tensor & rand_symint_out(at::Tensor & out, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names);
+TORCH_API at::Tensor & rand_symint_outf(c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::DimnameList> names, at::Tensor & out);
+TORCH_API at::Tensor rand(at::IntArrayRef size, at::TensorOptions options={});
+TORCH_API at::Tensor rand(at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+TORCH_API at::Tensor rand_symint(c10::SymIntArrayRef size, at::TensorOptions options={});
+TORCH_API at::Tensor rand_symint(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+TORCH_API at::Tensor & rand_out(at::Tensor & out, at::IntArrayRef size);
+TORCH_API at::Tensor & rand_outf(at::IntArrayRef size, at::Tensor & out);
+TORCH_API at::Tensor & rand_symint_out(at::Tensor & out, c10::SymIntArrayRef size);
+TORCH_API at::Tensor & rand_symint_outf(c10::SymIntArrayRef size, at::Tensor & out);
+TORCH_API at::Tensor rand(at::IntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options={});
+TORCH_API at::Tensor rand(at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+TORCH_API at::Tensor rand_symint(c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::TensorOptions options={});
+TORCH_API at::Tensor rand_symint(c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reciprocal.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reciprocal.h
new file mode 100644
index 0000000000000000000000000000000000000000..f102ae6436dfa77ee3b481c1ffe51ccb76824b1b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reciprocal.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/reciprocal_ops.h>
+
+namespace at {
+
+
+// aten::reciprocal(Tensor self) -> Tensor
+inline at::Tensor reciprocal(const at::Tensor & self) {
+    return at::_ops::reciprocal::call(self);
+}
+
+// aten::reciprocal_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & reciprocal_(at::Tensor & self) {
+    return at::_ops::reciprocal_::call(self);
+}
+
+// aten::reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & reciprocal_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::reciprocal_out::call(self, out);
+}
+// aten::reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & reciprocal_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::reciprocal_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c82d2524f415576bb7cb50492b0d0e9a5c21a7b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu6_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor relu6(const at::Tensor & self);
+TORCH_API at::Tensor & relu6_(at::Tensor & self);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..286fcd38471985fc327835c9676278d0f6cef255
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu_native.h
@@ -0,0 +1,35 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & relu_out(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor relu(const at::Tensor & self);
+TORCH_API at::Tensor & relu_(at::Tensor & self);
+TORCH_API at::Tensor NestedTensor_relu(const at::Tensor & self);
+TORCH_API at::Tensor & NestedTensor_relu_(at::Tensor & self);
+TORCH_API at::Tensor relu_sparse(const at::Tensor & self);
+TORCH_API at::Tensor & relu_sparse_(at::Tensor & self);
+TORCH_API at::Tensor relu_sparse_csr(const at::Tensor & self);
+TORCH_API at::Tensor & relu_sparse_csr_(at::Tensor & self);
+TORCH_API at::Tensor mkldnn_relu(const at::Tensor & self);
+TORCH_API at::Tensor & mkldnn_relu_(at::Tensor & self);
+TORCH_API at::Tensor relu_quantized_cpu(const at::Tensor & self);
+TORCH_API at::Tensor & relu_quantized_cpu_(at::Tensor & self);
+TORCH_API at::Tensor relu_quantized_cuda(const at::Tensor & self);
+TORCH_API at::Tensor & relu_quantized_cuda_(at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4ce9c77f6a0b33925bbb15e391a915e31891849e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_meta_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor remainder(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & remainder_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & remainder_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & remainder_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/resize_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/resize_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..804cc96a6159eea90f5dda8b109536ec37a8a295
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/resize_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor resize(const at::Tensor & self, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor resize_symint(const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API const at::Tensor & resize_out(const at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API const at::Tensor & resize_outf(const at::Tensor & self, at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format, const at::Tensor & out);
+TORCH_API const at::Tensor & resize_symint_out(const at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API const at::Tensor & resize_symint_outf(const at::Tensor & self, c10::SymIntArrayRef size, c10::optional<at::MemoryFormat> memory_format, const at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rnn_tanh_cell_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rnn_tanh_cell_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..99e254426f259cbedd7b6428ac46e800e3097041
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rnn_tanh_cell_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API rnn_tanh_cell {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::rnn_tanh_cell")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "rnn_tanh_cell(Tensor input, Tensor hx, Tensor w_ih, Tensor w_hh, Tensor? b_ih=None, Tensor? b_hh=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const c10::optional<at::Tensor> & b_ih, const c10::optional<at::Tensor> & b_hh);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const c10::optional<at::Tensor> & b_ih, const c10::optional<at::Tensor> & b_hh);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae26e69cbcc8cb779deb327ce69dce2ab5540c43
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API select_copy_int {
+  using schema = at::Tensor (const at::Tensor &, int64_t, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::select_copy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "int")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "select_copy.int(Tensor self, int dim, SymInt index) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t dim, c10::SymInt index);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt index);
+};
+
+struct TORCH_API select_copy_int_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, c10::SymInt, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::select_copy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "int_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "select_copy.int_out(Tensor self, int dim, SymInt index, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t dim, c10::SymInt index, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::SymInt index, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/selu_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/selu_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..d532d93326c135de5ea32552a65fe9c7df859f1a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/selu_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor selu(const at::Tensor & self);
+TORCH_API at::Tensor & selu_(at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c5865bee332ec9332e96e27e54445a4381b8551
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv3d_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor slow_conv3d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0);
+TORCH_API at::Tensor slow_conv3d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0));
+TORCH_API at::Tensor & slow_conv3d_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0);
+TORCH_API at::Tensor & slow_conv3d_outf(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, at::IntArrayRef padding, at::Tensor & out);
+TORCH_API at::Tensor & slow_conv3d_symint_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0));
+TORCH_API at::Tensor & slow_conv3d_symint_outf(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/soft_margin_loss.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/soft_margin_loss.h
new file mode 100644
index 0000000000000000000000000000000000000000..43a443e3cdcf87bdb3728e7dfddfcaa5f94c96a4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/soft_margin_loss.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/soft_margin_loss_ops.h>
+
+namespace at {
+
+
+// aten::soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & soft_margin_loss_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+    return at::_ops::soft_margin_loss_out::call(self, target, reduction, out);
+}
+// aten::soft_margin_loss.out(Tensor self, Tensor target, int reduction=Mean, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & soft_margin_loss_outf(const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & out) {
+    return at::_ops::soft_margin_loss_out::call(self, target, reduction, out);
+}
+
+// aten::soft_margin_loss(Tensor self, Tensor target, int reduction=Mean) -> Tensor
+inline at::Tensor soft_margin_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean) {
+    return at::_ops::soft_margin_loss::call(self, target, reduction);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sparse_sampled_addmm.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sparse_sampled_addmm.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5d9e60a4afa36339040d2fcfb760f4b035ed236
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sparse_sampled_addmm.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/sparse_sampled_addmm_ops.h>
+
+namespace at {
+
+
+// aten::sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & sparse_sampled_addmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+    return at::_ops::sparse_sampled_addmm_out::call(self, mat1, mat2, beta, alpha, out);
+}
+// aten::sparse_sampled_addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & sparse_sampled_addmm_outf(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out) {
+    return at::_ops::sparse_sampled_addmm_out::call(self, mat1, mat2, beta, alpha, out);
+}
+
+// aten::sparse_sampled_addmm(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor
+inline at::Tensor sparse_sampled_addmm(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1) {
+    return at::_ops::sparse_sampled_addmm::call(self, mat1, mat2, beta, alpha);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v.h
new file mode 100644
index 0000000000000000000000000000000000000000..cdfe3524670215ea738afc21d904254e09f8fbcb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v.h
@@ -0,0 +1,67 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/special_chebyshev_polynomial_v_ops.h>
+
+namespace at {
+
+
+// aten::special_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor
+inline at::Tensor special_chebyshev_polynomial_v(const at::Tensor & x, const at::Tensor & n) {
+    return at::_ops::special_chebyshev_polynomial_v::call(x, n);
+}
+
+// aten::special_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor
+inline at::Tensor special_chebyshev_polynomial_v(const at::Scalar & x, const at::Tensor & n) {
+    return at::_ops::special_chebyshev_polynomial_v_x_scalar::call(x, n);
+}
+
+// aten::special_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor
+inline at::Tensor special_chebyshev_polynomial_v(const at::Tensor & x, const at::Scalar & n) {
+    return at::_ops::special_chebyshev_polynomial_v_n_scalar::call(x, n);
+}
+
+// aten::special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & special_chebyshev_polynomial_v_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n) {
+    return at::_ops::special_chebyshev_polynomial_v_out::call(x, n, out);
+}
+// aten::special_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & special_chebyshev_polynomial_v_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out) {
+    return at::_ops::special_chebyshev_polynomial_v_out::call(x, n, out);
+}
+
+// aten::special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & special_chebyshev_polynomial_v_out(at::Tensor & out, const at::Scalar & x, const at::Tensor & n) {
+    return at::_ops::special_chebyshev_polynomial_v_x_scalar_out::call(x, n, out);
+}
+// aten::special_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & special_chebyshev_polynomial_v_outf(const at::Scalar & x, const at::Tensor & n, at::Tensor & out) {
+    return at::_ops::special_chebyshev_polynomial_v_x_scalar_out::call(x, n, out);
+}
+
+// aten::special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & special_chebyshev_polynomial_v_out(at::Tensor & out, const at::Tensor & x, const at::Scalar & n) {
+    return at::_ops::special_chebyshev_polynomial_v_n_scalar_out::call(x, n, out);
+}
+// aten::special_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & special_chebyshev_polynomial_v_outf(const at::Tensor & x, const at::Scalar & n, at::Tensor & out) {
+    return at::_ops::special_chebyshev_polynomial_v_n_scalar_out::call(x, n, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_gammaincc_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_gammaincc_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..675a56d99131267692adaafd6592d23c30b2cfd4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_gammaincc_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API special_gammaincc_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_gammaincc")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_gammaincc.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+};
+
+struct TORCH_API special_gammaincc {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_gammaincc")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_gammaincc(Tensor self, Tensor other) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..92e91ceca225a1c01c84cb15678ff83361b3b9d1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API special_scaled_modified_bessel_k1 {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_scaled_modified_bessel_k1")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_scaled_modified_bessel_k1(Tensor x) -> Tensor")
+  static at::Tensor call(const at::Tensor & x);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x);
+};
+
+struct TORCH_API special_scaled_modified_bessel_k1_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_scaled_modified_bessel_k1")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_scaled_modified_bessel_k1.out(Tensor x, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & x, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_copy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e1bbb987bfee8db81de231a8fa337d589324134
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_copy.h
@@ -0,0 +1,91 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/split_copy_ops.h>
+
+namespace at {
+
+
+// aten::split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> split_copy(const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+    return at::_ops::split_copy_Tensor::call(self, split_size, dim);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::vector<at::Tensor> split_copy(const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+    return at::_ops::split_copy_Tensor::call(self, split_size, dim);
+  }
+}
+
+// aten::split_copy.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]
+inline ::std::vector<at::Tensor> split_copy_symint(const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+    return at::_ops::split_copy_Tensor::call(self, split_size, dim);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::vector<at::Tensor> split_copy(const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+    return at::_ops::split_copy_Tensor::call(self, split_size, dim);
+  }
+}
+
+// aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+inline void split_copy_out(at::TensorList out, const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+    return at::_ops::split_copy_Tensor_out::call(self, split_size, dim, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  void split_copy_out(at::TensorList out, const at::Tensor & self, int64_t split_size, int64_t dim=0) {
+    return at::_ops::split_copy_Tensor_out::call(self, split_size, dim, out);
+  }
+}
+
+// aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+inline void split_copy_outf(const at::Tensor & self, int64_t split_size, int64_t dim, at::TensorList out) {
+    return at::_ops::split_copy_Tensor_out::call(self, split_size, dim, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  void split_copy_outf(const at::Tensor & self, int64_t split_size, int64_t dim, at::TensorList out) {
+    return at::_ops::split_copy_Tensor_out::call(self, split_size, dim, out);
+  }
+}
+
+// aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+inline void split_copy_symint_out(at::TensorList out, const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+    return at::_ops::split_copy_Tensor_out::call(self, split_size, dim, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  void split_copy_out(at::TensorList out, const at::Tensor & self, c10::SymInt split_size, int64_t dim=0) {
+    return at::_ops::split_copy_Tensor_out::call(self, split_size, dim, out);
+  }
+}
+
+// aten::split_copy.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()
+inline void split_copy_symint_outf(const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out) {
+    return at::_ops::split_copy_Tensor_out::call(self, split_size, dim, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  void split_copy_outf(const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out) {
+    return at::_ops::split_copy_Tensor_out::call(self, split_size, dim, out);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff7d837428a360ddcd6ef86ac69da7330494e5a0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/sym_constrain_range_for_size_ops.h>
+
+namespace at {
+
+
+// aten::sym_constrain_range_for_size(Scalar size, *, int? min=None, int? max=None) -> ()
+inline void sym_constrain_range_for_size(const at::Scalar & size, c10::optional<int64_t> min=c10::nullopt, c10::optional<int64_t> max=c10::nullopt) {
+    return at::_ops::sym_constrain_range_for_size::call(size, min, max);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/take_along_dim_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/take_along_dim_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5107a9fc75e7cef1e6ce375ababa3644fdfabbe
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/take_along_dim_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API take_along_dim_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::optional<int64_t>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::take_along_dim")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "take_along_dim.out(Tensor self, Tensor indices, int? dim=None, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & indices, c10::optional<int64_t> dim, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, c10::optional<int64_t> dim, at::Tensor & out);
+};
+
+struct TORCH_API take_along_dim {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::optional<int64_t>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::take_along_dim")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "take_along_dim(Tensor self, Tensor indices, int? dim=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & indices, c10::optional<int64_t> dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & indices, c10::optional<int64_t> dim);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold_copy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a8e83dad4c828e54b0d237d2b71e2ab8e2b5fbd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold_copy.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/unfold_copy_ops.h>
+
+namespace at {
+
+
+// aten::unfold_copy(Tensor self, int dimension, int size, int step) -> Tensor
+inline at::Tensor unfold_copy(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) {
+    return at::_ops::unfold_copy::call(self, dimension, size, step);
+}
+
+// aten::unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & unfold_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dimension, int64_t size, int64_t step) {
+    return at::_ops::unfold_copy_out::call(self, dimension, size, step, out);
+}
+// aten::unfold_copy.out(Tensor self, int dimension, int size, int step, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & unfold_copy_outf(const at::Tensor & self, int64_t dimension, int64_t size, int64_t step, at::Tensor & out) {
+    return at::_ops::unfold_copy_out::call(self, dimension, size, step, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/xlogy_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/xlogy_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9cf078c3ba8da78c1c5910d7171238e7cd238e4b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/xlogy_native.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/xlogy_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_xlogy_out : public at::meta::structured_xlogy_Tensor {
+void impl(const at::Tensor & self, const at::Tensor & other, const at::Tensor & out);
+};
+TORCH_API at::Tensor xlogy(const at::Scalar & self, const at::Tensor & other);
+TORCH_API at::Tensor & xlogy_out(const at::Scalar & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor xlogy(const at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor & xlogy_out(const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+TORCH_API at::Tensor & xlogy_(at::Tensor & self, const at::Scalar & other);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/zeros_like_compositeimplicitautogradnestedtensor_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/zeros_like_compositeimplicitautogradnestedtensor_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c8474bbea502c688f76aa12b464cb8ee06f32b6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/zeros_like_compositeimplicitautogradnestedtensor_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautogradnestedtensor {
+
+TORCH_API at::Tensor zeros_like(const at::Tensor & self, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor zeros_like(const at::Tensor & self, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format);
+
+} // namespace compositeimplicitautogradnestedtensor
+} // namespace at