diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_amp_update_scale.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_amp_update_scale.h
new file mode 100644
index 0000000000000000000000000000000000000000..85228b5bfebb7a6345f2c0759c1486948cd310dc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_amp_update_scale.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_amp_update_scale_ops.h>
+
+namespace at {
+
+
+// aten::_amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!)
+inline at::Tensor & _amp_update_scale_(at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+    return at::_ops::_amp_update_scale_::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);
+}
+
+// aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _amp_update_scale_out(at::Tensor & out, const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+    return at::_ops::_amp_update_scale_out::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out);
+}
+// aten::_amp_update_scale.out(Tensor self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _amp_update_scale_outf(const at::Tensor & self, at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval, at::Tensor & out) {
+    return at::_ops::_amp_update_scale_out::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval, out);
+}
+
+// aten::_amp_update_scale(Tensor self, Tensor growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> (Tensor, Tensor growth_tracker_out)
+inline ::std::tuple<at::Tensor,at::Tensor> _amp_update_scale(const at::Tensor & self, const at::Tensor & growth_tracker, const at::Tensor & found_inf, double scale_growth_factor, double scale_backoff_factor, int64_t growth_interval) {
+    return at::_ops::_amp_update_scale::call(self, growth_tracker, found_inf, scale_growth_factor, scale_backoff_factor, growth_interval);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_assert_async_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_assert_async_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0cdfd0b7dc8f080eecc8f0a5750676d812dac22c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_assert_async_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API void _assert_async(const at::Tensor & self);
+TORCH_API void _assert_async(const at::Tensor & self, c10::string_view assert_msg);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_batch_norm_impl_index.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_batch_norm_impl_index.h
new file mode 100644
index 0000000000000000000000000000000000000000..9726f9934bb6518f2c017c4bd1386f51da9074bb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_batch_norm_impl_index.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_batch_norm_impl_index_ops.h>
+
+namespace at {
+
+
+// aten::_batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,int64_t> _batch_norm_impl_index(const at::Tensor & input, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double momentum, double eps, bool cudnn_enabled) {
+    return at::_ops::_batch_norm_impl_index::call(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..31ffaca726d6fc061def1597b1468746f9afc35b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_dense_backward_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & _embedding_bag_dense_backward_out_symint(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx, at::Tensor & out);
+TORCH_API at::Tensor _embedding_bag_dense_backward_cpu(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+TORCH_API at::Tensor _embedding_bag_dense_backward_cuda(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offset2bag, const at::Tensor & bag_size, const at::Tensor & maximum_indices, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.h
new file mode 100644
index 0000000000000000000000000000000000000000..5dd8422289e8197308409de4d24ec92918be38cb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_ops.h>
+
+namespace at {
+
+
+// aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max) -> (Tensor output, Tensor mask)
+inline ::std::tuple<at::Tensor,at::Tensor> _fake_quantize_per_tensor_affine_cachemask_tensor_qparams(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, const at::Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max) {
+    return at::_ops::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams::call(self, scale, zero_point, fake_quant_enabled, quant_min, quant_max);
+}
+
+// aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+inline ::std::tuple<at::Tensor &,at::Tensor &> _fake_quantize_per_tensor_affine_cachemask_tensor_qparams_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, const at::Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max) {
+    return at::_ops::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_out::call(self, scale, zero_point, fake_quant_enabled, quant_min, quant_max, out0, out1);
+}
+// aten::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out(Tensor self, Tensor scale, Tensor zero_point, Tensor fake_quant_enabled, int quant_min, int quant_max, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+inline ::std::tuple<at::Tensor &,at::Tensor &> _fake_quantize_per_tensor_affine_cachemask_tensor_qparams_outf(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, const at::Tensor & fake_quant_enabled, int64_t quant_min, int64_t quant_max, at::Tensor & out0, at::Tensor & out1) {
+    return at::_ops::_fake_quantize_per_tensor_affine_cachemask_tensor_qparams_out::call(self, scale, zero_point, fake_quant_enabled, quant_min, quant_max, out0, out1);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfa5e50ea148351730407548a030209f0eac7aa4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_log.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_foreach_log_ops.h>
+
+namespace at {
+
+
+// aten::_foreach_log(Tensor[] self) -> Tensor[]
+inline ::std::vector<at::Tensor> _foreach_log(at::TensorList self) {
+    return at::_ops::_foreach_log::call(self);
+}
+
+// aten::_foreach_log_(Tensor(a!)[] self) -> ()
+inline void _foreach_log_(at::TensorList self) {
+    return at::_ops::_foreach_log_::call(self);
+}
+
+// aten::_foreach_log.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_log_out(at::TensorList out, at::TensorList self) {
+    return at::_ops::_foreach_log_out::call(self, out);
+}
+// aten::_foreach_log.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_log_outf(at::TensorList self, at::TensorList out) {
+    return at::_ops::_foreach_log_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sign_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sign_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f40c9177ab4aaa82eff1edfb725fb0bb34653826
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sign_cpu_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API ::std::vector<at::Tensor> _foreach_sign(at::TensorList self);
+TORCH_API void _foreach_sign_(at::TensorList self);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sinh_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sinh_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb5de3828be11e3c959ec24e9d1738ab7da69c15
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sinh_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API void _foreach_sinh_out(at::TensorList out, at::TensorList self);
+TORCH_API void _foreach_sinh_outf(at::TensorList self, at::TensorList out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fw_primal_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fw_primal_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ab748f7e3828c0398649a363b35c0fe2a2e90ff
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fw_primal_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor _fw_primal(const at::Tensor & self, int64_t level);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_is_any_true_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_is_any_true_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..d49a2682673648b539750ae39e5298329a1a9a86
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_is_any_true_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor _is_any_true(const at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_svd.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_svd.h
new file mode 100644
index 0000000000000000000000000000000000000000..a31348d1eab9b3b468cd86adb51f8905d6afbc2d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_svd.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_linalg_svd_ops.h>
+
+namespace at {
+
+
+// aten::_linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _linalg_svd(const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, c10::optional<c10::string_view> driver=c10::nullopt) {
+    return at::_ops::_linalg_svd::call(A, full_matrices, compute_uv, driver);
+}
+
+// aten::_linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _linalg_svd_out(at::Tensor & U, at::Tensor & S, at::Tensor & Vh, const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, c10::optional<c10::string_view> driver=c10::nullopt) {
+    return at::_ops::_linalg_svd_U::call(A, full_matrices, compute_uv, driver, U, S, Vh);
+}
+// aten::_linalg_svd.U(Tensor A, bool full_matrices=False, bool compute_uv=True, *, str? driver=None, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _linalg_svd_outf(const at::Tensor & A, bool full_matrices, bool compute_uv, c10::optional<c10::string_view> driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh) {
+    return at::_ops::_linalg_svd_U::call(A, full_matrices, compute_uv, driver, U, S, Vh);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_dual_copy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_dual_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..82a3a2eb6349b384eb91d2d3d627a40f77e92efe
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_dual_copy.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_make_dual_copy_ops.h>
+
+namespace at {
+
+
+// aten::_make_dual_copy(Tensor primal, Tensor tangent, int level) -> Tensor
+inline at::Tensor _make_dual_copy(const at::Tensor & primal, const at::Tensor & tangent, int64_t level) {
+    return at::_ops::_make_dual_copy::call(primal, tangent, level);
+}
+
+// aten::_make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _make_dual_copy_out(at::Tensor & out, const at::Tensor & primal, const at::Tensor & tangent, int64_t level) {
+    return at::_ops::_make_dual_copy_out::call(primal, tangent, level, out);
+}
+// aten::_make_dual_copy.out(Tensor primal, Tensor tangent, int level, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _make_dual_copy_outf(const at::Tensor & primal, const at::Tensor & tangent, int64_t level, at::Tensor & out) {
+    return at::_ops::_make_dual_copy_out::call(primal, tangent, level, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_select_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_select_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..6521ea4883f98aa6eadddc87e5e7ebdfaaf3d937
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_select_backward_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _nested_select_backward {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_nested_select_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_nested_select_backward(Tensor grad_output, Tensor self, int dim, SymInt index) -> Tensor")
+  static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, c10::SymInt index);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3cec2c29433521430615a473a2aec1b47007e4f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_new_zeros_with_same_feature_meta_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor _new_zeros_with_same_feature_meta(const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims=0);
+TORCH_API at::Tensor & _new_zeros_with_same_feature_meta_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims=0);
+TORCH_API at::Tensor & _new_zeros_with_same_feature_meta_outf(const at::Tensor & self, const at::Tensor & other, int64_t self_num_batch_dims, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_prelu_kernel.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_prelu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5f34405328ec6b3f720e1eeda0bd36beb7235e8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_prelu_kernel.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_prelu_kernel_ops.h>
+
+namespace at {
+
+
+// aten::_prelu_kernel(Tensor self, Tensor weight) -> Tensor
+inline at::Tensor _prelu_kernel(const at::Tensor & self, const at::Tensor & weight) {
+    return at::_ops::_prelu_kernel::call(self, weight);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_segment_reduce_backward_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_segment_reduce_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cd97c9f0777a2bcd44ec1441c9137becd79b493
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_segment_reduce_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & _segment_reduce_backward_out(at::Tensor & out, const at::Tensor & grad, const at::Tensor & output, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths={}, const c10::optional<at::Tensor> & offsets={}, int64_t axis=0, const c10::optional<at::Scalar> & initial=c10::nullopt);
+TORCH_API at::Tensor & _segment_reduce_backward_outf(const at::Tensor & grad, const at::Tensor & output, const at::Tensor & data, c10::string_view reduce, const c10::optional<at::Tensor> & lengths, const c10::optional<at::Tensor> & offsets, int64_t axis, const c10::optional<at::Scalar> & initial, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_shape_as_tensor_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_shape_as_tensor_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc6e733d7b8e34b5f76327392660a7940167e468
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_shape_as_tensor_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _shape_as_tensor {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_shape_as_tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_shape_as_tensor(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..b98006c864ae7010ba91a6335bd02fab78a6c208
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_standard_gamma_grad_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor _standard_gamma_grad(const at::Tensor & self, const at::Tensor & output);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_ambiguous_defaults_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_ambiguous_defaults_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1229fda9e74dfcc2fe71059dab2c240277dd64cd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_ambiguous_defaults_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _test_ambiguous_defaults_a {
+  using schema = at::Tensor (const at::Tensor &, int64_t, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_ambiguous_defaults")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "a")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_ambiguous_defaults.a(Tensor dummy, int a=1, int b=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & dummy, int64_t a, int64_t b);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & dummy, int64_t a, int64_t b);
+};
+
+struct TORCH_API _test_ambiguous_defaults_b {
+  using schema = at::Tensor (const at::Tensor &, int64_t, c10::string_view);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_ambiguous_defaults")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "b")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_ambiguous_defaults.b(Tensor dummy, int a=2, str b=\"2\") -> Tensor")
+  static at::Tensor call(const at::Tensor & dummy, int64_t a, c10::string_view b);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & dummy, int64_t a, c10::string_view b);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_transformer_encoder_layer_fwd.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_transformer_encoder_layer_fwd.h
new file mode 100644
index 0000000000000000000000000000000000000000..c521c493fcfe03f648b51a3c5212ef01fa81ebd1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_transformer_encoder_layer_fwd.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_transformer_encoder_layer_fwd_ops.h>
+
+namespace at {
+
+
+// aten::_transformer_encoder_layer_fwd(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None) -> Tensor
+inline at::Tensor _transformer_encoder_layer_fwd(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask={}, c10::optional<int64_t> mask_type=c10::nullopt) {
+    return at::_ops::_transformer_encoder_layer_fwd::call(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type);
+}
+
+// aten::_transformer_encoder_layer_fwd.out(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _transformer_encoder_layer_fwd_out(at::Tensor & out, const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask={}, c10::optional<int64_t> mask_type=c10::nullopt) {
+    return at::_ops::_transformer_encoder_layer_fwd_out::call(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type, out);
+}
+// aten::_transformer_encoder_layer_fwd.out(Tensor src, int embed_dim, int num_heads, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, bool use_gelu, bool norm_first, float eps, Tensor norm_weight_1, Tensor norm_bias_1, Tensor norm_weight_2, Tensor norm_bias_2, Tensor ffn_weight_1, Tensor ffn_bias_1, Tensor ffn_weight_2, Tensor ffn_bias_2, Tensor? mask=None, int? mask_type=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _transformer_encoder_layer_fwd_outf(const at::Tensor & src, int64_t embed_dim, int64_t num_heads, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, bool use_gelu, bool norm_first, double eps, const at::Tensor & norm_weight_1, const at::Tensor & norm_bias_1, const at::Tensor & norm_weight_2, const at::Tensor & norm_bias_2, const at::Tensor & ffn_weight_1, const at::Tensor & ffn_bias_1, const at::Tensor & ffn_weight_2, const at::Tensor & ffn_bias_2, const c10::optional<at::Tensor> & mask, c10::optional<int64_t> mask_type, at::Tensor & out) {
+    return at::_ops::_transformer_encoder_layer_fwd_out::call(src, embed_dim, num_heads, qkv_weight, qkv_bias, proj_weight, proj_bias, use_gelu, norm_first, eps, norm_weight_1, norm_bias_1, norm_weight_2, norm_bias_2, ffn_weight_1, ffn_bias_1, ffn_weight_2, ffn_bias_2, mask, mask_type, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7b0310b745941fa1a5c73c8ea1d2c6a839036ff
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bicubic2d_aa_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor _upsample_bicubic2d_aa(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt);
+TORCH_API at::Tensor _upsample_bicubic2d_aa_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt);
+TORCH_API at::Tensor & _upsample_bicubic2d_aa_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt);
+TORCH_API at::Tensor & _upsample_bicubic2d_aa_outf(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out);
+TORCH_API at::Tensor & _upsample_bicubic2d_aa_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt);
+TORCH_API at::Tensor & _upsample_bicubic2d_aa_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce39c759f24919f49c6250d923664e6c9b586ebe
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_weight_norm_interface_backward_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> _weight_norm_interface_backward(const at::Tensor & grad_w, const at::Tensor & saved_v, const at::Tensor & saved_g, const at::Tensor & saved_norms, int64_t dim);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..57c0d6e74969cdbd57373b088e9dd2fcc9b22354
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool3d_backward_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor adaptive_max_pool3d_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices);
+TORCH_API at::Tensor & adaptive_max_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices);
+TORCH_API at::Tensor & adaptive_max_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & indices, at::Tensor & grad_input);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/aminmax_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/aminmax_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd2795703b8cca8089c550a306b4b1017ba33d84
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/aminmax_meta_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> aminmax(const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> aminmax_out(at::Tensor & min, at::Tensor & max, const at::Tensor & self, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> aminmax_outf(const at::Tensor & self, c10::optional<int64_t> dim, bool keepdim, at::Tensor & min, at::Tensor & max);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arctan2.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arctan2.h
new file mode 100644
index 0000000000000000000000000000000000000000..47ebab57eafa3b3ce04aebf8c8a8c36132e2276f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arctan2.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/arctan2_ops.h>
+
+namespace at {
+
+
+// aten::arctan2(Tensor self, Tensor other) -> Tensor
+inline at::Tensor arctan2(const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::arctan2::call(self, other);
+}
+
+// aten::arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & arctan2_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::arctan2_out::call(self, other, out);
+}
+// aten::arctan2.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & arctan2_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+    return at::_ops::arctan2_out::call(self, other, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a40adba4099a39cfd0b7e34f2cf8ecbf1f8f115
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atleast_3d_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor atleast_3d(const at::Tensor & self);
+TORCH_API ::std::vector<at::Tensor> atleast_3d(at::TensorList tensors);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/baddbmm_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/baddbmm_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..5062fd6e296bc170a33dcb91b8c2f47a4f87632b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/baddbmm_native.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/baddbmm_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_baddbmm_out_cpu : public at::meta::structured_baddbmm {
+void impl(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, const at::Tensor & out);
+};
+struct TORCH_API structured_baddbmm_out_cuda : public at::meta::structured_baddbmm {
+void impl(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, const at::Tensor & out);
+};
+TORCH_API at::Tensor & baddbmm_out_sparse_csr_cuda(const at::Tensor & self, const at::Tensor & batch1, const at::Tensor & batch2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_not.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_not.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9b96053c48cfa05340425ed162c6784564f79bb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_not.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/bitwise_not_ops.h>
+
+namespace at {
+
+
+// aten::bitwise_not(Tensor self) -> Tensor
+inline at::Tensor bitwise_not(const at::Tensor & self) {
+    return at::_ops::bitwise_not::call(self);
+}
+
+// aten::bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_not_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::bitwise_not_out::call(self, out);
+}
+// aten::bitwise_not.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_not_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::bitwise_not_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cdist.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cdist.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e47bfcec353107cd45a179d699112361a48ea6b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cdist.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/cdist_ops.h>
+
+namespace at {
+
+
+// aten::cdist(Tensor x1, Tensor x2, float p=2, int? compute_mode=None) -> Tensor
+inline at::Tensor cdist(const at::Tensor & x1, const at::Tensor & x2, double p=2, c10::optional<int64_t> compute_mode=c10::nullopt) {
+    return at::_ops::cdist::call(x1, x2, p, compute_mode);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clone_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clone_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..048e3f1baed01a7f9fd2142b02fa643bd36801d8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/clone_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor clone(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor & clone_out(at::Tensor & out, const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor & clone_outf(const at::Tensor & self, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..179abb68f754e9d79e45f18988572ee10dde3723
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/convolution_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> convolution_backward_symint(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalIntArrayRef bias_sizes, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_symint_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> convolution_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & input, const at::Tensor & weight, at::OptionalSymIntArrayRef bias_sizes, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/copy_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/copy_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cc0f58ad766b4ff5321a64c6e1b7105a3d80d1d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/copy_native.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & copy_out(const at::Tensor & self, const at::Tensor & src, bool non_blocking, at::Tensor & out);
+TORCH_API at::Tensor & copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking=false);
+TORCH_API at::Tensor & copy_nested_(at::Tensor & self, const at::Tensor & src, bool non_blocking=false);
+TORCH_API at::Tensor & copy_sparse_wrapper_(at::Tensor & self, const at::Tensor & src, bool non_blocking=false);
+TORCH_API at::Tensor & copy_sparse_compressed_(at::Tensor & self, const at::Tensor & src, bool non_blocking=false);
+TORCH_API at::Tensor & copy_mkldnn_(at::Tensor & self, const at::Tensor & src, bool non_blocking=false);
+TORCH_API at::Tensor copy(const at::Tensor & self, const at::Tensor & src, bool non_blocking=false);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ce479e2af13630a88b4d15797aed97a9b7ab4d2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API diag_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::diag")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "diag.out(Tensor self, int diagonal=0, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t diagonal, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t diagonal, at::Tensor & out);
+};
+
+struct TORCH_API diag {
+  using schema = at::Tensor (const at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::diag")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "diag(Tensor self, int diagonal=0) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t diagonal);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t diagonal);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagonal_scatter_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagonal_scatter_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..48da4d394acc4e1470d4df3c5dd5006be7a2796d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagonal_scatter_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & diagonal_scatter_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & src, int64_t offset=0, int64_t dim1=0, int64_t dim2=1);
+TORCH_API at::Tensor & diagonal_scatter_outf(const at::Tensor & self, const at::Tensor & src, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..40e5fa0e276039a4214f1e6f404efccd99331827
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_native.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor empty_names(at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor & empty_names_out(at::IntArrayRef size, c10::optional<at::DimnameList> names, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out);
+TORCH_API at::Tensor & empty_out(at::IntArrayRef size, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out);
+TORCH_API at::Tensor empty_cpu(at::IntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor empty_cuda(at::IntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor empty_sparse(at::IntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor empty_sparse_compressed(at::IntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor empty_meta_symint(c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor empty_mkldnn(at::IntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor empty_unknown_quantized(at::IntArrayRef size, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_strided_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_strided_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3cb51333816546f5aff174bc5fde4bb94391e45b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_strided_meta_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, at::TensorOptions options={});
+TORCH_API at::Tensor empty_strided(at::IntArrayRef size, at::IntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+TORCH_API at::Tensor empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::TensorOptions options={});
+TORCH_API at::Tensor empty_strided_symint(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_strided_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_strided_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd3a1d44caf0941417c7f1b9c9d122e4c77a0336
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/empty_strided_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API empty_strided {
+  using schema = at::Tensor (c10::SymIntArrayRef, c10::SymIntArrayRef, c10::optional<at::ScalarType>, c10::optional<at::Layout>, c10::optional<at::Device>, c10::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::empty_strided")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "empty_strided(SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor")
+  static at::Tensor call(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+};
+
+struct TORCH_API empty_strided_out {
+  using schema = at::Tensor & (c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::empty_strided")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "empty_strided.out(SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/exp_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/exp_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fb15f44bc794f0c59bced73b98ff5b967ac35da
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/exp_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_exp : public TensorIteratorBase {
+    
+    
+    void meta(const at::Tensor & self);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_hfft_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_hfft_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..47de3841d9220c3cd419fcaef000c6cd15dc834a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_hfft_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor fft_hfft_symint(const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt);
+TORCH_API at::Tensor & fft_hfft_symint_out(const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flatten_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flatten_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..484326525aabb5dfe48376df5032187fa90c6067
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flatten_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor flatten(const at::Tensor & self, int64_t start_dim=0, int64_t end_dim=-1);
+TORCH_API at::Tensor flatten(const at::Tensor & self, int64_t start_dim, int64_t end_dim, at::Dimname out_dim);
+TORCH_API at::Tensor flatten(const at::Tensor & self, at::Dimname start_dim, at::Dimname end_dim, at::Dimname out_dim);
+TORCH_API at::Tensor flatten(const at::Tensor & self, at::DimnameList dims, at::Dimname out_dim);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fmod_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fmod_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5842c4a4bc52301f61dc685e316efc068e63f5d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fmod_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor fmod(const at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor & fmod_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor & fmod_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+TORCH_API at::Tensor & fmod_(at::Tensor & self, const at::Scalar & other);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..a37accf4c783dd7314208ee702259bfac0fe16a9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor full_like(const at::Tensor & self, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype={}, c10::optional<at::Layout> layout={}, c10::optional<at::Device> device={}, c10::optional<bool> pin_memory={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt);
+TORCH_API at::Tensor & full_like_out(const at::Tensor & self, const at::Scalar & fill_value, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_coalesced.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_coalesced.h
new file mode 100644
index 0000000000000000000000000000000000000000..02adf420b8f60128d326766f8c8b68bae7edc816
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_coalesced.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/is_coalesced_ops.h>
+
+namespace at {
+
+
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_vulkan_available.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_vulkan_available.h
new file mode 100644
index 0000000000000000000000000000000000000000..16284213f697652c484ae36a1d773bf527194172
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_vulkan_available.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/is_vulkan_available_ops.h>
+
+namespace at {
+
+
+// aten::is_vulkan_available() -> bool
+inline bool is_vulkan_available() {
+    return at::_ops::is_vulkan_available::call();
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/l1_loss_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/l1_loss_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..dfdf6af5fe56b6775385ac8417045acadbd0308f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/l1_loss_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor l1_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lgamma.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lgamma.h
new file mode 100644
index 0000000000000000000000000000000000000000..2479d0374d78a5fcd345a49724aec689b757e878
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lgamma.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/lgamma_ops.h>
+
+namespace at {
+
+
+// aten::lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & lgamma_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::lgamma_out::call(self, out);
+}
+// aten::lgamma.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & lgamma_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::lgamma_out::call(self, out);
+}
+
+// aten::lgamma(Tensor self) -> Tensor
+inline at::Tensor lgamma(const at::Tensor & self) {
+    return at::_ops::lgamma::call(self);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_vector_norm_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_vector_norm_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..c46f1037217dc754867bc80a72083644a84d0234
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_vector_norm_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API linalg_vector_norm {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &, at::OptionalIntArrayRef, bool, c10::optional<at::ScalarType>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::linalg_vector_norm")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "linalg_vector_norm(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype);
+};
+
+struct TORCH_API linalg_vector_norm_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::OptionalIntArrayRef, bool, c10::optional<at::ScalarType>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::linalg_vector_norm")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Scalar & ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/log1p_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/log1p_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f89b0a6447d9db4dda2150924abe066f5d073dd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/log1p_meta_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor log1p(const at::Tensor & self);
+TORCH_API at::Tensor & log1p_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & log1p_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & log1p_(at::Tensor & self);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_exp_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_exp_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba48acf20658fa668a0e6e5db30879f276ad8207
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_exp_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor matrix_exp(const at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_power_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_power_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..39ca5332f208177b05ea7e723bd81c5b64f4ce74
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_power_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API matrix_power {
+  using schema = at::Tensor (const at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::matrix_power")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "matrix_power(Tensor self, int n) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t n);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n);
+};
+
+struct TORCH_API matrix_power_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::matrix_power")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "matrix_power.out(Tensor self, int n, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t n, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_backward_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..bb57e0461025210400f06613c278d73201dd327e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_backward_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> miopen_batch_norm_backward_out(const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> miopen_batch_norm_backward(const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..11600e74144250ff9709b24ee035b82287d0c1e0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_backward_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & mkldnn_adaptive_avg_pool2d_backward_out(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor mkldnn_adaptive_avg_pool2d_backward(const at::Tensor & grad_output, const at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_channel_shuffle_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_channel_shuffle_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..47f776d1870ee79c0cb6797ee1bfd13e07a1ab03
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_channel_shuffle_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor native_channel_shuffle(const at::Tensor & self, int64_t groups);
+TORCH_API at::Tensor native_channel_shuffle_symint(const at::Tensor & self, c10::SymInt groups);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f6ad69ad11ce08e9112b547e09f3899b36226fc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm_backward.h
@@ -0,0 +1,91 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/native_layer_norm_backward_ops.h>
+
+namespace at {
+
+
+// aten::native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_layer_norm_backward::call(grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_layer_norm_backward::call(grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask);
+  }
+}
+
+// aten::native_layer_norm_backward(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward_symint(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_layer_norm_backward::call(grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_backward(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_layer_norm_backward::call(grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask);
+  }
+}
+
+// aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_layer_norm_backward_out::call(grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_layer_norm_backward_out::call(grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask, out0, out1, out2);
+  }
+}
+
+// aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_outf(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_layer_norm_backward_out::call(grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_outf(const at::Tensor & grad_out, const at::Tensor & input, at::IntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_layer_norm_backward_out::call(grad_out, input, c10::fromIntArrayRefSlow(normalized_shape), mean, rstd, weight, bias, output_mask, out0, out1, out2);
+  }
+}
+
+// aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_symint_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_layer_norm_backward_out::call(grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask) {
+    return at::_ops::native_layer_norm_backward_out::call(grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask, out0, out1, out2);
+  }
+}
+
+// aten::native_layer_norm_backward.out(Tensor grad_out, Tensor input, SymInt[] normalized_shape, Tensor mean, Tensor rstd, Tensor? weight, Tensor? bias, bool[3] output_mask, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_symint_outf(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_layer_norm_backward_out::call(grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_backward_outf(const at::Tensor & grad_out, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const at::Tensor & mean, const at::Tensor & rstd, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_layer_norm_backward_out::call(grad_out, input, normalized_shape, mean, rstd, weight, bias, output_mask, out0, out1, out2);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..690c0c07ca7e45d7bcf6159b242116fb8170eadc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor nested_to_padded_tensor(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=c10::nullopt);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal.h
new file mode 100644
index 0000000000000000000000000000000000000000..e627042d084e7c4af359f6efe328c3d9dd743610
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/normal.h
@@ -0,0 +1,169 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/normal_ops.h>
+
+namespace at {
+
+
+// aten::normal_functional(Tensor self, float mean=0, float std=1, *, Generator? generator=None) -> Tensor
+inline at::Tensor normal_functional(const at::Tensor & self, double mean=0, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_functional::call(self, mean, std, generator);
+}
+
+// aten::normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_out(at::Tensor & out, const at::Tensor & mean, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_Tensor_float_out::call(mean, std, generator, out);
+}
+// aten::normal.Tensor_float_out(Tensor mean, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_outf(const at::Tensor & mean, double std, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::normal_Tensor_float_out::call(mean, std, generator, out);
+}
+
+// aten::normal.Tensor_float(Tensor mean, float std=1, *, Generator? generator=None) -> Tensor
+inline at::Tensor normal(const at::Tensor & mean, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_Tensor_float::call(mean, std, generator);
+}
+
+// aten::normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_out(at::Tensor & out, double mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_float_Tensor_out::call(mean, std, generator, out);
+}
+// aten::normal.float_Tensor_out(float mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_outf(double mean, const at::Tensor & std, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::normal_float_Tensor_out::call(mean, std, generator, out);
+}
+
+// aten::normal.float_Tensor(float mean, Tensor std, *, Generator? generator=None) -> Tensor
+inline at::Tensor normal(double mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_float_Tensor::call(mean, std, generator);
+}
+
+// aten::normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_out(at::Tensor & out, const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_Tensor_Tensor_out::call(mean, std, generator, out);
+}
+// aten::normal.Tensor_Tensor_out(Tensor mean, Tensor std, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_outf(const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::normal_Tensor_Tensor_out::call(mean, std, generator, out);
+}
+
+// aten::normal.Tensor_Tensor(Tensor mean, Tensor std, *, Generator? generator=None) -> Tensor
+inline at::Tensor normal(const at::Tensor & mean, const at::Tensor & std, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_Tensor_Tensor::call(mean, std, generator);
+}
+
+// aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor normal(double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt, at::TensorOptions options={}) {
+    return at::_ops::normal_float_float::call(mean, std, c10::fromIntArrayRefSlow(size), generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor normal(double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt, at::TensorOptions options={}) {
+    return at::_ops::normal_float_float::call(mean, std, c10::fromIntArrayRefSlow(size), generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}
+
+// aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor normal(double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::normal_float_float::call(mean, std, c10::fromIntArrayRefSlow(size), generator, dtype, layout, device, pin_memory);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor normal(double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::normal_float_float::call(mean, std, c10::fromIntArrayRefSlow(size), generator, dtype, layout, device, pin_memory);
+  }
+}
+
+// aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor normal_symint(double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt, at::TensorOptions options={}) {
+    return at::_ops::normal_float_float::call(mean, std, size, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor normal(double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt, at::TensorOptions options={}) {
+    return at::_ops::normal_float_float::call(mean, std, size, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}
+
+// aten::normal.float_float(float mean, float std, SymInt[] size, *, Generator? generator=None, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor normal_symint(double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::normal_float_float::call(mean, std, size, generator, dtype, layout, device, pin_memory);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor normal(double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::normal_float_float::call(mean, std, size, generator, dtype, layout, device, pin_memory);
+  }
+}
+
+// aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_out(at::Tensor & out, double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_float_float_out::call(mean, std, c10::fromIntArrayRefSlow(size), generator, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & normal_out(at::Tensor & out, double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_float_float_out::call(mean, std, c10::fromIntArrayRefSlow(size), generator, out);
+  }
+}
+
+// aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_outf(double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::normal_float_float_out::call(mean, std, c10::fromIntArrayRefSlow(size), generator, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & normal_outf(double mean, double std, at::IntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::normal_float_float_out::call(mean, std, c10::fromIntArrayRefSlow(size), generator, out);
+  }
+}
+
+// aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_symint_out(at::Tensor & out, double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_float_float_out::call(mean, std, size, generator, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & normal_out(at::Tensor & out, double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_float_float_out::call(mean, std, size, generator, out);
+  }
+}
+
+// aten::normal.float_float_out(float mean, float std, SymInt[] size, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_symint_outf(double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::normal_float_float_out::call(mean, std, size, generator, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & normal_outf(double mean, double std, c10::SymIntArrayRef size, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::normal_float_float_out::call(mean, std, size, generator, out);
+  }
+}
+
+// aten::normal.out(Tensor self, float mean=0, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_out(at::Tensor & out, const at::Tensor & self, double mean=0, double std=1, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::normal_out::call(self, mean, std, generator, out);
+}
+// aten::normal.out(Tensor self, float mean=0, float std=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & normal_outf(const at::Tensor & self, double mean, double std, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::normal_out::call(self, mean, std, generator, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/permute_copy_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/permute_copy_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..bfa7c81872489faadb894545b47cc4dd56be8a60
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/permute_copy_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & permute_copy_out(const at::Tensor & self, at::IntArrayRef dims, at::Tensor & out);
+TORCH_API at::Tensor permute_copy(const at::Tensor & self, at::IntArrayRef dims);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..50530de9e91b79bd05386ab4d3a73756a75fbf9b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_cpu_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor prod(const at::Tensor & self, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor prod(const at::Tensor & self, int64_t dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & prod_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & prod_outf(const at::Tensor & self, int64_t dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/q_zero_point.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/q_zero_point.h
new file mode 100644
index 0000000000000000000000000000000000000000..83c3fdd1056961e0e10a8a97c9ff92e941c7cf8b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/q_zero_point.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/q_zero_point_ops.h>
+
+namespace at {
+
+
+// aten::q_zero_point(Tensor self) -> int
+inline int64_t q_zero_point(const at::Tensor & self) {
+    return at::_ops::q_zero_point::call(self);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantile_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantile_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d782a9477c7a586138c734aec7c359451f05554c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantile_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor quantile(const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear");
+TORCH_API at::Tensor & quantile_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear");
+TORCH_API at::Tensor & quantile_outf(const at::Tensor & self, const at::Tensor & q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, at::Tensor & out);
+TORCH_API at::Tensor quantile(const at::Tensor & self, double q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear");
+TORCH_API at::Tensor & quantile_out(at::Tensor & out, const at::Tensor & self, double q, c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false, c10::string_view interpolation="linear");
+TORCH_API at::Tensor & quantile_outf(const at::Tensor & self, double q, c10::optional<int64_t> dim, bool keepdim, c10::string_view interpolation, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reciprocal_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reciprocal_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f92f978ab7381fa48d6fc0ba265a580444de53a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reciprocal_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API reciprocal {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::reciprocal")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "reciprocal(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API reciprocal_ {
+  using schema = at::Tensor & (at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::reciprocal_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "reciprocal_(Tensor(a!) self) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);
+};
+
+struct TORCH_API reciprocal_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::reciprocal")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "reciprocal.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad3d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad3d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..ee92d8a790a1291c8da8edf14fdeac7871896bd7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad3d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API reflection_pad3d_out {
+  using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::reflection_pad3d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "reflection_pad3d.out(Tensor self, SymInt[6] padding, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out);
+};
+
+struct TORCH_API reflection_pad3d {
+  using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::reflection_pad3d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "reflection_pad3d(Tensor self, SymInt[6] padding) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef padding);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sigmoid_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sigmoid_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b78740a4273630a4ce3f98bc77b04ad267b35cf
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sigmoid_cuda_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor sigmoid(const at::Tensor & self);
+TORCH_API at::Tensor & sigmoid_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & sigmoid_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & sigmoid_(at::Tensor & self);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..51c83f1b07faf194616e92efdf3fc717b85ffc84
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor & slow_conv_dilated3d_out_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out);
+TORCH_API at::Tensor slow_conv_dilated3d_cpu(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1);
+TORCH_API at::Tensor slow_conv_dilated3d_cuda(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f3404005e865a3f3b9867aeee4888ad82fd4d40
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/slow_conv_dilated3d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API slow_conv_dilated3d {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const c10::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::slow_conv_dilated3d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "slow_conv_dilated3d(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation);
+};
+
+struct TORCH_API slow_conv_dilated3d_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, const c10::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::slow_conv_dilated3d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "slow_conv_dilated3d.out(Tensor self, Tensor weight, SymInt[3] kernel_size, Tensor? bias=None, SymInt[3] stride=1, SymInt[3] padding=0, SymInt[3] dilation=1, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softmax_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softmax_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..f43baad39cc2ab4b87c0299237a31c0938e2e3f6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softmax_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API softmax_int {
+  using schema = at::Tensor (const at::Tensor &, int64_t, c10::optional<at::ScalarType>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::softmax")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "int")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype);
+};
+
+struct TORCH_API softmax_int_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, c10::optional<at::ScalarType>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::softmax")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "int_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "softmax.int_out(Tensor self, int dim, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t dim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+};
+
+struct TORCH_API softmax_Dimname {
+  using schema = at::Tensor (const at::Tensor &, at::Dimname, c10::optional<at::ScalarType>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::softmax")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Dimname")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname dim, c10::optional<at::ScalarType> dtype);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erf_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erf_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..863a10eb0deaa82d8bf8274f9de2e149ed74693f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erf_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor special_erf(const at::Tensor & self);
+TORCH_API at::Tensor & special_erf_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_erf_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_h_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_h_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b1592b95c0d94f71545326de31c8508cba91a5c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_hermite_polynomial_h_ops.h
@@ -0,0 +1,83 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API special_hermite_polynomial_h {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_hermite_polynomial_h")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_hermite_polynomial_h(Tensor x, Tensor n) -> Tensor")
+  static at::Tensor call(const at::Tensor & x, const at::Tensor & n);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n);
+};
+
+struct TORCH_API special_hermite_polynomial_h_x_scalar {
+  using schema = at::Tensor (const at::Scalar &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_hermite_polynomial_h")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "x_scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_hermite_polynomial_h.x_scalar(Scalar x, Tensor n) -> Tensor")
+  static at::Tensor call(const at::Scalar & x, const at::Tensor & n);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n);
+};
+
+struct TORCH_API special_hermite_polynomial_h_n_scalar {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_hermite_polynomial_h")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "n_scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_hermite_polynomial_h.n_scalar(Tensor x, Scalar n) -> Tensor")
+  static at::Tensor call(const at::Tensor & x, const at::Scalar & n);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n);
+};
+
+struct TORCH_API special_hermite_polynomial_h_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_hermite_polynomial_h")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_hermite_polynomial_h.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+};
+
+struct TORCH_API special_hermite_polynomial_h_x_scalar_out {
+  using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_hermite_polynomial_h")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "x_scalar_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_hermite_polynomial_h.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Scalar & x, const at::Tensor & n, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out);
+};
+
+struct TORCH_API special_hermite_polynomial_h_n_scalar_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_hermite_polynomial_h")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "n_scalar_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_hermite_polynomial_h.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & x, const at::Scalar & n, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sspaddmm_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sspaddmm_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a30dda6cc94c7e9aa95d7d83934cf4fb327d8a5f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sspaddmm_cpu_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor & sspaddmm_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1);
+TORCH_API at::Tensor & sspaddmm_outf(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sspaddmm_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sspaddmm_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..78e8443ae493b07f40d50c94b89cae125b633c73
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sspaddmm_native.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor sspaddmm(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta=1, const at::Scalar & alpha=1);
+TORCH_API at::Tensor & _sspaddmm_out_only_sparse(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out);
+TORCH_API at::Tensor & _sspaddmm_out_only_sparse_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out);
+TORCH_API at::Tensor & _sspaddmm_out_cpu(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out);
+TORCH_API at::Tensor & _sspaddmm_out_cuda(const at::Tensor & self, const at::Tensor & mat1, const at::Tensor & mat2, const at::Scalar & beta, const at::Scalar & alpha, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/stft.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/stft.h
new file mode 100644
index 0000000000000000000000000000000000000000..acbf94d85abac02d6f6b832bd53af0637ff60097
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/stft.h
@@ -0,0 +1,35 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/stft_ops.h>
+
+namespace at {
+
+
+// aten::stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+inline at::Tensor stft(const at::Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<at::Tensor> & window, bool normalized, c10::optional<bool> onesided=c10::nullopt, c10::optional<bool> return_complex=c10::nullopt) {
+    return at::_ops::stft::call(self, n_fft, hop_length, win_length, window, normalized, onesided, return_complex);
+}
+
+// aten::stft.center(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool center=True, str pad_mode="reflect", bool normalized=False, bool? onesided=None, bool? return_complex=None) -> Tensor
+inline at::Tensor stft(const at::Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length=c10::nullopt, c10::optional<int64_t> win_length=c10::nullopt, const c10::optional<at::Tensor> & window={}, bool center=true, c10::string_view pad_mode="reflect", bool normalized=false, c10::optional<bool> onesided=c10::nullopt, c10::optional<bool> return_complex=c10::nullopt) {
+    return at::_ops::stft_center::call(self, n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided, return_complex);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..38a93fd70a70b6d4f0575bafa304d650981190dd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_constrain_range_for_size_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API void sym_constrain_range_for_size(const at::Scalar & size, c10::optional<int64_t> min=c10::nullopt, c10::optional<int64_t> max=c10::nullopt);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_consecutive_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_consecutive_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..490355337a43062f3a94e191f301819104d09f22
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_consecutive_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> unique_dim_consecutive(const at::Tensor & self, int64_t dim, bool return_inverse=false, bool return_counts=false);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..28fdd6fe7939e822ead0766a47b4b943e8e1572a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor upsample_trilinear3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors);
+TORCH_API at::Tensor upsample_trilinear3d_symint(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, bool align_corners, c10::optional<at::ArrayRef<double>> scale_factors);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_as_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_as_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..8357f32a9336d32a6f6e969d87c4551a6eba345f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_as_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor view_as(const at::Tensor & self, const at::Tensor & other);
+} // namespace native
+} // namespace at