diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..769ba38dc3770fc2d98e307b466f17cc4615fad2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_native.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// Declarations only (no bodies in this header): one `out` variant taking a c10::SymIntArrayRef size, plus four functions taking at::IntArrayRef; the _cpu/_cuda/_quantized_* suffixes presumably name the backend -- confirm against native_functions.yaml.
+namespace at {
+namespace native {
+TORCH_API at::Tensor & _adaptive_avg_pool2d_out_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out);
+TORCH_API at::Tensor adaptive_avg_pool2d_cpu(const at::Tensor & self, at::IntArrayRef output_size);
+TORCH_API at::Tensor adaptive_avg_pool2d_cuda(const at::Tensor & self, at::IntArrayRef output_size);
+TORCH_API at::Tensor adaptive_avg_pool2d_quantized_cpu(const at::Tensor & self, at::IntArrayRef output_size);
+TORCH_API at::Tensor adaptive_avg_pool2d_quantized_cuda(const at::Tensor & self, at::IntArrayRef output_size);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_coalesced.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_coalesced.h
new file mode 100644
index 0000000000000000000000000000000000000000..5499f0cd27c43e4def80f694e7bb06067edaf1f3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_coalesced.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_coalesced_ops.h>
+
+namespace at {
+// Thin inline wrappers: each forwards its arguments to a static `call` under at::_ops (presumably declared in the _coalesced_ops.h include above).
+// _coalesced_out and _coalesced_outf make the identical call; they differ only in where `out` sits in the parameter list (first vs. last).
+// aten::_coalesced.out(Tensor self, bool coalesced, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _coalesced_out(at::Tensor & out, const at::Tensor & self, bool coalesced) {
+    return at::_ops::_coalesced_out::call(self, coalesced, out);
+}
+// aten::_coalesced.out(Tensor self, bool coalesced, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _coalesced_outf(const at::Tensor & self, bool coalesced, at::Tensor & out) {
+    return at::_ops::_coalesced_out::call(self, coalesced, out);
+}
+
+// aten::_coalesced(Tensor self, bool coalesced) -> Tensor
+inline at::Tensor _coalesced(const at::Tensor & self, bool coalesced) {
+    return at::_ops::_coalesced::call(self, coalesced);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_convert_indices_from_csr_to_coo.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_convert_indices_from_csr_to_coo.h
new file mode 100644
index 0000000000000000000000000000000000000000..173f1d1f98c5d19dcd9da6ce48745f333c433331
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_convert_indices_from_csr_to_coo.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_convert_indices_from_csr_to_coo_ops.h>
+
+namespace at {
+// Thin inline wrappers: each forwards its arguments to a static `call` under at::_ops (presumably declared in the _convert_indices_from_csr_to_coo_ops.h include above).
+// The functional and `_out` forms default out_int32 and transpose to false; `_outf` takes every argument explicitly, with `out` last.
+// aten::_convert_indices_from_csr_to_coo(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False) -> Tensor
+inline at::Tensor _convert_indices_from_csr_to_coo(const at::Tensor & crow_indices, const at::Tensor & col_indices, bool out_int32=false, bool transpose=false) {
+    return at::_ops::_convert_indices_from_csr_to_coo::call(crow_indices, col_indices, out_int32, transpose);
+}
+
+// aten::_convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _convert_indices_from_csr_to_coo_out(at::Tensor & out, const at::Tensor & crow_indices, const at::Tensor & col_indices, bool out_int32=false, bool transpose=false) {
+    return at::_ops::_convert_indices_from_csr_to_coo_out::call(crow_indices, col_indices, out_int32, transpose, out);
+}
+// aten::_convert_indices_from_csr_to_coo.out(Tensor crow_indices, Tensor col_indices, *, bool out_int32=False, bool transpose=False, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _convert_indices_from_csr_to_coo_outf(const at::Tensor & crow_indices, const at::Tensor & col_indices, bool out_int32, bool transpose, at::Tensor & out) {
+    return at::_ops::_convert_indices_from_csr_to_coo_out::call(crow_indices, col_indices, out_int32, transpose, out);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_dim_arange_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_dim_arange_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..f707679384af9f938a656ad83823cf9a6e60d4e3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_dim_arange_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Descriptor for the aten::_dim_arange operator: `schema`/`ptr_schema` alias the C++ function type, and the three STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA lines supply the name, overload name and schema string (macro expansion not visible here).
+// `call` and `redispatch` are only declared in this header; `redispatch` takes the same arguments preceded by a c10::DispatchKeySet.
+struct TORCH_API _dim_arange {
+  using schema = at::Tensor (const at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_dim_arange")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_dim_arange(Tensor like, int dim) -> Tensor")
+  static at::Tensor call(const at::Tensor & like, int64_t dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & like, int64_t dim);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f508cd08dea91fcbc1f7373caa6df486d52470c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_channel_affine.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_fake_quantize_learnable_per_channel_affine_ops.h>
+
+namespace at {
+// Thin inline wrappers: each forwards its arguments to a static `call` under at::_ops (presumably declared in the _fake_quantize_learnable_per_channel_affine_ops.h include above).
+// The functional and `_out` forms default grad_factor to 1.0; `_outf` takes every argument explicitly, with `out` last.
+// aten::_fake_quantize_learnable_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0) -> Tensor
+inline at::Tensor _fake_quantize_learnable_per_channel_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) {
+    return at::_ops::_fake_quantize_learnable_per_channel_affine::call(self, scale, zero_point, axis, quant_min, quant_max, grad_factor);
+}
+
+// aten::_fake_quantize_learnable_per_channel_affine.out(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _fake_quantize_learnable_per_channel_affine_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor=1.0) {
+    return at::_ops::_fake_quantize_learnable_per_channel_affine_out::call(self, scale, zero_point, axis, quant_min, quant_max, grad_factor, out);
+}
+// aten::_fake_quantize_learnable_per_channel_affine.out(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max, float grad_factor=1.0, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _fake_quantize_learnable_per_channel_affine_outf(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max, double grad_factor, at::Tensor & out) {
+    return at::_ops::_fake_quantize_learnable_per_channel_affine_out::call(self, scale, zero_point, axis, quant_min, quant_max, grad_factor, out);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_ceil_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_ceil_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bfd3e0005d275acc723524d8c73dcc96ffabd6f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_ceil_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// Declarations only: `_foreach_ceil` returns a vector of tensors, `_foreach_ceil_` returns void (the trailing underscore presumably marks the in-place form -- confirm against the op schema).
+TORCH_API ::std::vector<at::Tensor> _foreach_ceil(at::TensorList self);
+TORCH_API void _foreach_ceil_(at::TensorList self);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_erf_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_erf_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..939ef0a6c22bdd330e3418a2bcf1671e7d5c61a7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_erf_cpu_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+// Declarations only: `_foreach_erf` returns a vector of tensors, `_foreach_erf_` returns void (the trailing underscore presumably marks the in-place form -- confirm against the op schema).
+TORCH_API ::std::vector<at::Tensor> _foreach_erf(at::TensorList self);
+TORCH_API void _foreach_erf_(at::TensorList self);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c223519b05ca78fb02d1f673e4d660e188e6bf27
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_sign_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+// Declarations only: both functions return void and take the same two tensor lists; they differ in parameter order (`out` first in `_out`, `out` last in `_outf`).
+TORCH_API void _foreach_sign_out(at::TensorList out, at::TensorList self);
+TORCH_API void _foreach_sign_outf(at::TensorList self, at::TensorList out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fused_moving_avg_obs_fq_helper_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fused_moving_avg_obs_fq_helper_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..80929c51e87de8f7a844b362c70faaad37a4fce0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fused_moving_avg_obs_fq_helper_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+// Declarations only: `_functional` takes every tensor by const reference and returns six tensors by value; `_out`/`_outf` take running_min, running_max, scale and zero_point by non-const reference and return references to out0/out1-typed tensors. Only `_outf` lacks defaults for the two trailing bools.
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> _fused_moving_avg_obs_fq_helper_functional(const at::Tensor & self, const at::Tensor & observer_on, const at::Tensor & fake_quant_on, const at::Tensor & running_min, const at::Tensor & running_max, const at::Tensor & scale, const at::Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant=false, bool symmetric_quant=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> _fused_moving_avg_obs_fq_helper_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & self, const at::Tensor & observer_on, const at::Tensor & fake_quant_on, at::Tensor & running_min, at::Tensor & running_max, at::Tensor & scale, at::Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant=false, bool symmetric_quant=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> _fused_moving_avg_obs_fq_helper_outf(const at::Tensor & self, const at::Tensor & observer_on, const at::Tensor & fake_quant_on, at::Tensor & running_min, at::Tensor & running_max, at::Tensor & scale, at::Tensor & zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, bool per_row_fake_quant, bool symmetric_quant, at::Tensor & out0, at::Tensor & out1);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_local_scalar_dense.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_local_scalar_dense.h
new file mode 100644
index 0000000000000000000000000000000000000000..bbe1f9fb40d8ddf2057546ba6414e2e487bbf0d9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_local_scalar_dense.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_local_scalar_dense_ops.h>
+
+namespace at {
+
+// Thin inline wrapper: passes `self` straight to at::_ops::_local_scalar_dense::call (presumably declared in the _local_scalar_dense_ops.h include above) and returns the resulting at::Scalar.
+// aten::_local_scalar_dense(Tensor self) -> Scalar
+inline at::Scalar _local_scalar_dense(const at::Tensor & self) {
+    return at::_ops::_local_scalar_dense::call(self);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lstm_mps_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lstm_mps_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..afe96eeba989ce8642609c38e61f7759141c4606
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lstm_mps_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// Declaration only (no body in this header): takes six output tensors out0..out5 by non-const reference after the regular arguments and returns a tuple of six tensor references.
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &,at::Tensor &> _lstm_mps_out(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, at::Tensor & out3, at::Tensor & out4, at::Tensor & out5);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..798fdf3be3d46d7078cc8019c36d889e9a8a306c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sobol_engine_initialize_state_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Descriptor for the aten::_sobol_engine_initialize_state_ operator: `schema`/`ptr_schema` alias the C++ function type, and the three STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA lines supply the name, overload name and schema string (macro expansion not visible here).
+// `call` and `redispatch` are only declared in this header; both take `self` by non-const reference and return at::Tensor &, and `redispatch` additionally takes a leading c10::DispatchKeySet.
+struct TORCH_API _sobol_engine_initialize_state_ {
+  using schema = at::Tensor & (at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sobol_engine_initialize_state_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sobol_engine_initialize_state_(Tensor(a!) self, int dimension) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, int64_t dimension);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t dimension);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_csr_sum_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_csr_sum_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2359ac61b7c874ebbe02474bb4a7b81bacb1c004
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_csr_sum_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+// Declarations only: `_out` takes `out` first and defaults keepdim=false, dtype=c10::nullopt; `_outf` takes `out` last and has no defaults. Both return at::Tensor &.
+TORCH_API at::Tensor & _sparse_csr_sum_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & _sparse_csr_sum_outf(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_stack_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_stack_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc1d4d74b83e5dba02b2cfa734f7b4f7d4caf0e8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_stack_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+// Declarations only: `_stack` returns a tensor by value; `_stack_out` (`out` first, dim defaults to 0) and `_stack_outf` (`out` last, no defaults) return at::Tensor &.
+TORCH_API at::Tensor _stack(at::TensorList tensors, int64_t dim=0);
+TORCH_API at::Tensor & _stack_out(at::Tensor & out, at::TensorList tensors, int64_t dim=0);
+TORCH_API at::Tensor & _stack_outf(at::TensorList tensors, int64_t dim, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_bsc_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_bsc_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..938209e8c7cf4b39b0ebc65a6afbe50a82214370
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_bsc_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+// Declarations only: `_out` takes `out` first and defaults dense_dim to c10::nullopt; `_outf` takes `out` last and has no defaults. Both return at::Tensor &.
+TORCH_API at::Tensor & _to_sparse_bsc_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim=c10::nullopt);
+TORCH_API at::Tensor & _to_sparse_bsc_outf(const at::Tensor & self, at::IntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_backward_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_backward_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..8a72a6838131ebd0fc0d8faea990555e211a9d6e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_nearest_exact1d_backward_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+// Declarations only, in two parallel families: plain names take at::IntArrayRef sizes, `_symint` names take c10::SymIntArrayRef. Within each family, `_out` puts grad_input first and defaults scales to c10::nullopt, `_outf` puts grad_input last with no defaults.
+TORCH_API at::Tensor _upsample_nearest_exact1d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales=c10::nullopt);
+TORCH_API at::Tensor _upsample_nearest_exact1d_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales=c10::nullopt);
+TORCH_API at::Tensor & _upsample_nearest_exact1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales=c10::nullopt);
+TORCH_API at::Tensor & _upsample_nearest_exact1d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, c10::optional<double> scales, at::Tensor & grad_input);
+TORCH_API at::Tensor & _upsample_nearest_exact1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales=c10::nullopt);
+TORCH_API at::Tensor & _upsample_nearest_exact1d_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, c10::optional<double> scales, at::Tensor & grad_input);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a97f582d8b31b0df20969f78c05cdb66098c2b2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_bsc_tensor_args_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declaration only: returns void and takes all three tensors by const reference; how a failed validation is reported is not visible from this header.
+TORCH_API void _validate_sparse_bsc_tensor_args(const at::Tensor & ccol_indices, const at::Tensor & row_indices, const at::Tensor & values, at::IntArrayRef size);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..05fe922a69a6c7a24ff0587d77b0c982d70aca61
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_max_pool1d_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+// Declaration only: returns a pair of tensors by value; the meaning of each tuple element is not stated in this header (presumably pooled values and indices -- confirm against the op schema).
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> adaptive_max_pool1d(const at::Tensor & self, at::IntArrayRef output_size);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addcdiv_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addcdiv_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cccb6190b5c1a61c1b8f0c38a87618f545ad83c7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addcdiv_meta_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor addcdiv(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1);
+TORCH_API at::Tensor & addcdiv_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1);
+TORCH_API at::Tensor & addcdiv_outf(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, at::Tensor & out);
+TORCH_API at::Tensor & addcdiv_(at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value=1);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addcdiv_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addcdiv_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..578cf7d73b56de517af031db160113bbf79d5a8e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/addcdiv_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/addcdiv_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_addcdiv_out : public at::meta::structured_addcdiv {
+void impl(const at::Tensor & self, const at::Tensor & tensor1, const at::Tensor & tensor2, const at::Scalar & value, const at::Tensor & out);
+};
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c01775a93f2dcdd355309f356deea3137efc5e1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/all_cpu_dispatch.h
@@ -0,0 +1,31 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor all(const at::Tensor & self, int64_t dim, bool keepdim=false);
+TORCH_API at::Tensor & all_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool keepdim=false);
+TORCH_API at::Tensor & all_outf(const at::Tensor & self, int64_t dim, bool keepdim, at::Tensor & out);
+TORCH_API at::Tensor all(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false);
+TORCH_API at::Tensor & all_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false);
+TORCH_API at::Tensor & all_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out);
+TORCH_API at::Tensor all(const at::Tensor & self);
+TORCH_API at::Tensor & all_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & all_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arccosh.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arccosh.h
new file mode 100644
index 0000000000000000000000000000000000000000..3335e73c2e0cd6ccabe35e595440e4940b9be439
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arccosh.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/arccosh_ops.h>
+
+namespace at {
+
+
+// aten::arccosh(Tensor self) -> Tensor
+inline at::Tensor arccosh(const at::Tensor & self) {
+    return at::_ops::arccosh::call(self);
+}
+
+// aten::arccosh_(Tensor(a!) self) -> Tensor(a!)
+inline at::Tensor & arccosh_(at::Tensor & self) {
+    return at::_ops::arccosh_::call(self);
+}
+
+// aten::arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & arccosh_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::arccosh_out::call(self, out);
+}
+// aten::arccosh.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & arccosh_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::arccosh_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2429ecf6be19d32a6f7dd85bbfc725389e501b74
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/atan_cuda_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor atan(const at::Tensor & self);
+TORCH_API at::Tensor & atan_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & atan_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & atan_(at::Tensor & self);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_backward_elemt_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_backward_elemt_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bcb852633608c23608b99fb457032332ff03b85
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/batch_norm_backward_elemt_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor batch_norm_backward_elemt(const at::Tensor & grad_out, const at::Tensor & input, const at::Tensor & mean, const at::Tensor & invstd, const c10::optional<at::Tensor> & weight, const at::Tensor & sum_dy, const at::Tensor & sum_dy_xmu, const at::Tensor & count);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bilinear_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bilinear_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3cdb8cb81a724dc6f5666220c803506533cc00b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bilinear_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor bilinear(const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={});
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/concatenate_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/concatenate_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bf7b7b8e829d39a4d534dcf88629ac06cadf580
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/concatenate_native.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor concatenate(at::TensorList tensors, int64_t dim=0);
+TORCH_API at::Tensor & concatenate_out(at::TensorList tensors, int64_t dim, at::Tensor & out);
+TORCH_API at::Tensor concatenate(at::TensorList tensors, at::Dimname dim);
+TORCH_API at::Tensor & concatenate_out(at::TensorList tensors, at::Dimname dim, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv1d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv1d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1188f80b837632a7d9596838046567cf5067d94
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv1d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API conv1d {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::conv1d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "conv1d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, SymInt[1] padding=0, SymInt[1] dilation=1, SymInt groups=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+};
+
+struct TORCH_API conv1d_padding {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, c10::SymIntArrayRef, c10::string_view, c10::SymIntArrayRef, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::conv1d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "padding")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[1] stride=1, str padding=\"valid\", SymInt[1] dilation=1, SymInt groups=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv2d.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv2d.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5cc39919d544f44c73ed313a5b5e057a4246653
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv2d.h
@@ -0,0 +1,69 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/conv2d_ops.h>
+
+namespace at {
+
+
+// aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+inline at::Tensor conv2d(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, int64_t groups=1) {
+    return at::_ops::conv2d::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor conv2d(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, int64_t groups=1) {
+    return at::_ops::conv2d::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), groups);
+  }
+}
+
+// aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+inline at::Tensor conv2d_symint(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+    return at::_ops::conv2d::call(input, weight, bias, stride, padding, dilation, groups);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor conv2d(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+    return at::_ops::conv2d::call(input, weight, bias, stride, padding, dilation, groups);
+  }
+}
+
+// aten::conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+inline at::Tensor conv2d(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, c10::string_view padding, at::IntArrayRef dilation=1, int64_t groups=1) {
+    return at::_ops::conv2d_padding::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), padding, c10::fromIntArrayRefSlow(dilation), groups);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor conv2d(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, at::IntArrayRef stride, c10::string_view padding, at::IntArrayRef dilation=1, int64_t groups=1) {
+    return at::_ops::conv2d_padding::call(input, weight, bias, c10::fromIntArrayRefSlow(stride), padding, c10::fromIntArrayRefSlow(dilation), groups);
+  }
+}
+
+// aten::conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding="valid", SymInt[2] dilation=1, SymInt groups=1) -> Tensor
+inline at::Tensor conv2d_symint(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+    return at::_ops::conv2d_padding::call(input, weight, bias, stride, padding, dilation, groups);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor conv2d(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation=c10::SymInt(1), c10::SymInt groups=1) {
+    return at::_ops::conv2d_padding::call(input, weight, bias, stride, padding, dilation, groups);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv2d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv2d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..33c6ce9de803ad7735391a75027dd79906827958
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/conv2d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API conv2d {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::conv2d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "conv2d(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, SymInt[2] padding=0, SymInt[2] dilation=1, SymInt groups=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+};
+
+struct TORCH_API conv2d_padding {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, c10::SymIntArrayRef, c10::string_view, c10::SymIntArrayRef, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::conv2d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "padding")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, SymInt[2] stride=1, str padding=\"valid\", SymInt[2] dilation=1, SymInt groups=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cross_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cross_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..de9b36bbeda4da429980947514e60ba563d85f46
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cross_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor cross(const at::Tensor & self, const at::Tensor & other, c10::optional<int64_t> dim=c10::nullopt);
+TORCH_API at::Tensor & cross_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, c10::optional<int64_t> dim=c10::nullopt);
+TORCH_API at::Tensor & cross_outf(const at::Tensor & self, const at::Tensor & other, c10::optional<int64_t> dim, at::Tensor & out);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d308a207af09ab4cda49ede0734464c36ef869c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/elu_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API elu_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, const at::Scalar &, const at::Scalar &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::elu")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "elu.out(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale, at::Tensor & out);
+};
+
+struct TORCH_API elu {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &, const at::Scalar &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::elu")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale);
+};
+
+struct TORCH_API elu_ {
+  using schema = at::Tensor & (at::Tensor &, const at::Scalar &, const at::Scalar &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::elu_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "elu_(Tensor(a!) self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, const at::Scalar & alpha, const at::Scalar & scale, const at::Scalar & input_scale);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a8190caa08b71c64bf8453fd2cecce9f3ce512f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fft_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor fft_fft_symint(const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt);
+TORCH_API at::Tensor & fft_fft_symint_out(const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftshift_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftshift_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..839ca021a29fdc04d8dbbfbf37a7c7276cfdf297
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftshift_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor fft_fftshift(const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd25d2b27dbbd3a2c8000b5440a770787780fe1b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_ifftshift.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/fft_ifftshift_ops.h>
+
+namespace at {
+
+
+// aten::fft_ifftshift(Tensor self, int[1]? dim=None) -> Tensor
+inline at::Tensor fft_ifftshift(const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt) {
+    return at::_ops::fft_ifftshift::call(self, dim);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flatten_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flatten_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..a982886ce6cc22541980e2755cd531a4e278e2c6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flatten_ops.h
@@ -0,0 +1,61 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API flatten_using_ints {
+  using schema = at::Tensor (const at::Tensor &, int64_t, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::flatten")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "using_ints")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)")
+  static at::Tensor call(const at::Tensor & self, int64_t start_dim, int64_t end_dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t start_dim, int64_t end_dim);
+};
+
+struct TORCH_API flatten_named_out_dim {
+  using schema = at::Tensor (const at::Tensor &, int64_t, int64_t, at::Dimname);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::flatten")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "named_out_dim")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "flatten.named_out_dim(Tensor(a) self, int start_dim, int end_dim, Dimname out_dim) -> Tensor(a)")
+  static at::Tensor call(const at::Tensor & self, int64_t start_dim, int64_t end_dim, at::Dimname out_dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t start_dim, int64_t end_dim, at::Dimname out_dim);
+};
+
+struct TORCH_API flatten_using_names {
+  using schema = at::Tensor (const at::Tensor &, at::Dimname, at::Dimname, at::Dimname);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::flatten")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "using_names")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "flatten.using_names(Tensor(a) self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor(a)")
+  static at::Tensor call(const at::Tensor & self, at::Dimname start_dim, at::Dimname end_dim, at::Dimname out_dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Dimname start_dim, at::Dimname end_dim, at::Dimname out_dim);
+};
+
+struct TORCH_API flatten_DimnameList {
+  using schema = at::Tensor (const at::Tensor &, at::DimnameList, at::Dimname);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::flatten")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "DimnameList")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "flatten.DimnameList(Tensor(a) self, Dimname[] dims, Dimname out_dim) -> Tensor(a)")
+  static at::Tensor call(const at::Tensor & self, at::DimnameList dims, at::Dimname out_dim);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::DimnameList dims, at::Dimname out_dim);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/from_file_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/from_file_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9f6a41e95d7df479da9eaf13fec45bb1017604b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/from_file_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & from_file_out(at::Tensor & out, c10::string_view filename, c10::optional<bool> shared=c10::nullopt, c10::optional<int64_t> size=0);
+TORCH_API at::Tensor & from_file_outf(c10::string_view filename, c10::optional<bool> shared, c10::optional<int64_t> size, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/greater_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/greater_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..87975e91d719d2773de635bf566799e4d315e9bb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/greater_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,30 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor greater(const at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor & greater_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor & greater_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+TORCH_API at::Tensor & greater_(at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor greater(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & greater_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & greater_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & greater_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/igamma_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/igamma_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..993156e810d1e7fb1576fda00cf1855602473f28
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/igamma_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor igamma(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & igamma_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/inverse_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/inverse_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd884d762101fded87d988f8f24dc7c9582a3104
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/inverse_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API inverse {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::inverse")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "inverse(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API inverse_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::inverse")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "inverse.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_set_to.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_set_to.h
new file mode 100644
index 0000000000000000000000000000000000000000..a179cdd64ae896c2bbb3d37861cdb83326f19b46
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_set_to.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/is_set_to_ops.h>
+
+namespace at {
+
+
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_signed_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_signed_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7ee7ff1358cd217a41ffcbcd86f24bd22f961c0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_signed_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API is_signed {
+  using schema = bool (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::is_signed")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "is_signed(Tensor self) -> bool")
+  static bool call(const at::Tensor & self);
+  static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_cross_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_cross_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d633f2aa6510a5b0dbc0885e0754681b125b13f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_cross_meta_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor linalg_cross(const at::Tensor & self, const at::Tensor & other, int64_t dim=-1);
+TORCH_API at::Tensor & linalg_cross_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other, int64_t dim=-1);
+TORCH_API at::Tensor & linalg_cross_outf(const at::Tensor & self, const at::Tensor & other, int64_t dim, at::Tensor & out);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigh_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigh_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6f4f3423d58435bef913eb243e5618540dab8de
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_eigh_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> linalg_eigh(const at::Tensor & self, c10::string_view UPLO="L");
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> linalg_eigh_out(const at::Tensor & self, c10::string_view UPLO, at::Tensor & eigvals, at::Tensor & eigvecs);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_triangular_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_triangular_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..79be6ba3e2f32dc84ffbfa214fe7736e6a4c9ed7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_triangular_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor linalg_solve_triangular(const at::Tensor & self, const at::Tensor & B, bool upper, bool left=true, bool unitriangular=false);
+TORCH_API at::Tensor & linalg_solve_triangular_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & B, bool upper, bool left=true, bool unitriangular=false);
+TORCH_API at::Tensor & linalg_solve_triangular_outf(const at::Tensor & self, const at::Tensor & B, bool upper, bool left, bool unitriangular, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..defb4de9f624422ee47c381a1ae2b13b9b25bae9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> lstm(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first);
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> lstm(const at::Tensor & data, const at::Tensor & batch_sizes, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d7a57759cc4495ee539be01ebd6afc315f3db2d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_meta_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> max_pool2d_with_indices(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> max_pool2d_with_indices_out(at::Tensor & out, at::Tensor & indices, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> max_pool2d_with_indices_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e71a5f367e8d38ce7b30aa1c1e387f04cc308c7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_backward_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API miopen_batch_norm_backward {
+  using schema = ::std::tuple<at::Tensor,at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, double);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::miopen_batch_norm_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor)")
+  static ::std::tuple<at::Tensor,at::Tensor,at::Tensor> call(const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon);
+  static ::std::tuple<at::Tensor,at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon);
+};
+
+struct TORCH_API miopen_batch_norm_backward_out {
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> (const at::Tensor &, const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, double, at::Tensor &, at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::miopen_batch_norm_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "miopen_batch_norm_backward.out(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))")
+  static ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> call(const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+  static ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & grad_output, const at::Tensor & weight, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, const c10::optional<at::Tensor> & save_mean, const c10::optional<at::Tensor> & save_var, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/msort_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/msort_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..9267b868350507a795e67689b743d512e897f1b6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/msort_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API msort_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::msort")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "msort.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+struct TORCH_API msort {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::msort")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "msort(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multi_margin_loss_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multi_margin_loss_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..692c9b58c8e44ca80206de0c434b7875f60e7258
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multi_margin_loss_native.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor multi_margin_loss_cpu(const at::Tensor & self, const at::Tensor & target, const at::Scalar & p=1, const at::Scalar & margin=1, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean);
+TORCH_API at::Tensor & multi_margin_loss_cpu_out(const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const c10::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & out);
+TORCH_API at::Tensor multi_margin_loss_cuda(const at::Tensor & self, const at::Tensor & target, const at::Scalar & p=1, const at::Scalar & margin=1, const c10::optional<at::Tensor> & weight={}, int64_t reduction=at::Reduction::Mean);
+TORCH_API at::Tensor & multi_margin_loss_cuda_out(const at::Tensor & self, const at::Tensor & target, const at::Scalar & p, const at::Scalar & margin, const c10::optional<at::Tensor> & weight, int64_t reduction, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nan_to_num_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nan_to_num_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..17a27eec3e977d97548ff03a40ec30146e048efb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nan_to_num_cpu_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor & nan_to_num_out(at::Tensor & out, const at::Tensor & self, c10::optional<double> nan=c10::nullopt, c10::optional<double> posinf=c10::nullopt, c10::optional<double> neginf=c10::nullopt);
+TORCH_API at::Tensor & nan_to_num_outf(const at::Tensor & self, c10::optional<double> nan, c10::optional<double> posinf, c10::optional<double> neginf, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/narrow_copy_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/narrow_copy_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..5eec61a5d0901e7005732bd22167ec529da433d0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/narrow_copy_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations only: narrow_copy plus its out/outf forms, each in an int64_t flavour and a *_symint flavour whose start/length are c10::SymInt (dim is int64_t throughout).
+namespace cpu {
+// The *_out forms take `out` as the first parameter, the *_outf forms take it as the last; all four return at::Tensor &.
+TORCH_API at::Tensor narrow_copy(const at::Tensor & self, int64_t dim, int64_t start, int64_t length);
+TORCH_API at::Tensor narrow_copy_symint(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length);
+TORCH_API at::Tensor & narrow_copy_out(at::Tensor & out, const at::Tensor & self, int64_t dim, int64_t start, int64_t length);
+TORCH_API at::Tensor & narrow_copy_outf(const at::Tensor & self, int64_t dim, int64_t start, int64_t length, at::Tensor & out);
+TORCH_API at::Tensor & narrow_copy_symint_out(at::Tensor & out, const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length);
+TORCH_API at::Tensor & narrow_copy_symint_outf(const at::Tensor & self, int64_t dim, c10::SymInt start, c10::SymInt length, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm.h
new file mode 100644
index 0000000000000000000000000000000000000000..336eb8bf21d61b6999cb5b52ddbcc9b597945211
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/native_layer_norm.h
@@ -0,0 +1,91 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/native_layer_norm_ops.h>
+
+namespace at {
+
+// IntArrayRef entry points: normalized_shape is converted with c10::fromIntArrayRefSlow and everything is forwarded to at::_ops::native_layer_norm::call. The symint:: template below is enabled only for T == int64_t.
+// aten::native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+    return at::_ops::native_layer_norm::call(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+    return at::_ops::native_layer_norm::call(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps);
+  }
+}
+// SymIntArrayRef entry points: normalized_shape is passed to at::_ops::native_layer_norm::call unchanged. The symint:: template below is enabled only for T == c10::SymInt.
+// aten::native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm_symint(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+    return at::_ops::native_layer_norm::call(input, normalized_shape, weight, bias, eps);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> native_layer_norm(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+    return at::_ops::native_layer_norm::call(input, normalized_shape, weight, bias, eps);
+  }
+}
+// Out variant with the outputs listed first: IntArrayRef normalized_shape is converted with c10::fromIntArrayRefSlow, and out0/out1/out2 become the trailing arguments of at::_ops::native_layer_norm_out::call. The symint:: template is enabled only for T == int64_t.
+// aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+    return at::_ops::native_layer_norm_out::call(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+    return at::_ops::native_layer_norm_out::call(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps, out0, out1, out2);
+  }
+}
+// Out variant with the outputs listed last (outf): makes the same at::_ops::native_layer_norm_out::call as the outputs-first wrapper; only the wrapper's parameter order differs. The symint:: template is enabled only for T == int64_t.
+// aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_outf(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_layer_norm_out::call(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_outf(const at::Tensor & input, at::IntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_layer_norm_out::call(input, c10::fromIntArrayRefSlow(normalized_shape), weight, bias, eps, out0, out1, out2);
+  }
+}
+// SymIntArrayRef out variant, outputs first: normalized_shape is forwarded unchanged. Note the naming: the free function is native_layer_norm_symint_out, while the symint:: template (enabled only for T == c10::SymInt) is named native_layer_norm_out.
+// aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_symint_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+    return at::_ops::native_layer_norm_out::call(input, normalized_shape, weight, bias, eps, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps) {
+    return at::_ops::native_layer_norm_out::call(input, normalized_shape, weight, bias, eps, out0, out1, out2);
+  }
+}
+// SymIntArrayRef out variant, outputs last (outf): normalized_shape is forwarded unchanged to at::_ops::native_layer_norm_out::call. The symint:: template (enabled only for T == c10::SymInt) is named native_layer_norm_outf.
+// aten::native_layer_norm.out(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+inline ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_symint_outf(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_layer_norm_out::call(input, normalized_shape, weight, bias, eps, out0, out1, out2);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> native_layer_norm_outf(const at::Tensor & input, c10::SymIntArrayRef normalized_shape, const c10::optional<at::Tensor> & weight, const c10::optional<at::Tensor> & bias, double eps, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2) {
+    return at::_ops::native_layer_norm_out::call(input, normalized_shape, weight, bias, eps, out0, out1, out2);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/one_hot_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/one_hot_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..5be948361d64c577972d38d4146266963bb55b0a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/one_hot_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declaration only: one_hot in namespace at::compositeimplicitautograd, taking a const tensor and an int64_t num_classes.
+namespace compositeimplicitautograd {
+// num_classes defaults to -1; what -1 means is not visible in this header -- NOTE(review): confirm against the implementation.
+TORCH_API at::Tensor one_hot(const at::Tensor & self, int64_t num_classes=-1);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/one_hot_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/one_hot_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..f26843c9183189eb2966f6722df11fa8b0c34617
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/one_hot_native.h
@@ -0,0 +1,21 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// Declares at::native::one_hot (declaration only); num_classes defaults to -1, whose meaning is not visible in this header.
+namespace at {
+namespace native {
+TORCH_API at::Tensor one_hot(const at::Tensor & self, int64_t num_classes=-1);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/polygamma_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/polygamma_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..76f8045fc928132c6e03d2b045102282ae8c8486
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/polygamma_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Operator handle for the polygamma.out schema: takes (n, self, out) and returns at::Tensor &. call and redispatch are only declared here; redispatch additionally takes a leading c10::DispatchKeySet.
+struct TORCH_API polygamma_out {
+  using schema = at::Tensor & (int64_t, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::polygamma")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "polygamma.out(int n, Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(int64_t n, const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, int64_t n, const at::Tensor & self, at::Tensor & out);
+};
+// Operator handle for the polygamma schema with an empty overload name: takes (n, self) and returns at::Tensor by value. call and redispatch are only declared here.
+struct TORCH_API polygamma {
+  using schema = at::Tensor (int64_t, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::polygamma")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "polygamma(int n, Tensor self) -> Tensor")
+  static at::Tensor call(int64_t n, const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, int64_t n, const at::Tensor & self);
+};
+// Operator handle for the polygamma_ schema: takes a non-const at::Tensor & self and returns at::Tensor &. Argument order is (self, n) here, whereas the polygamma and polygamma.out handles take n first.
+struct TORCH_API polygamma_ {
+  using schema = at::Tensor & (at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::polygamma_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "polygamma_(Tensor(a!) self, int n) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, int64_t n);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t n);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/promote_types_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/promote_types_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..619382018766ebc69ea59f1425a92971c9483a15
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/promote_types_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declaration only: promote_types in namespace at::compositeimplicitautograd.
+namespace compositeimplicitautograd {
+// Takes two at::ScalarType values by value and returns an at::ScalarType; the promotion rules themselves are not visible in this header.
+TORCH_API at::ScalarType promote_types(at::ScalarType type1, at::ScalarType type2);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/random_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/random_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..07a82b0fcb4eed29b8f41db84392f667502226cb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/random_ops.h
@@ -0,0 +1,116 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Operator handle for the random_.from schema: non-const at::Tensor & self, int64_t from, optional int64_t to, optional generator; returns at::Tensor &. call and redispatch are only declared here.
+struct TORCH_API random__from {
+  using schema = at::Tensor & (at::Tensor &, int64_t, c10::optional<int64_t>, c10::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "from")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random_.from(Tensor(a!) self, int from, int? to, *, Generator? generator=None) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator);
+};
+// Operator handle for the random_.to schema: non-const at::Tensor & self, int64_t to, optional generator; returns at::Tensor &. call and redispatch are only declared here.
+struct TORCH_API random__to {
+  using schema = at::Tensor & (at::Tensor &, int64_t, c10::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "to")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random_.to(Tensor(a!) self, int to, *, Generator? generator=None) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, int64_t to, c10::optional<at::Generator> generator);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, int64_t to, c10::optional<at::Generator> generator);
+};
+// Operator handle for the random_ schema with an empty overload name: non-const at::Tensor & self and an optional generator; returns at::Tensor &. call and redispatch are only declared here.
+struct TORCH_API random_ {
+  using schema = at::Tensor & (at::Tensor &, c10::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random_(Tensor(a!) self, *, Generator? generator=None) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self, c10::optional<at::Generator> generator);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self, c10::optional<at::Generator> generator);
+};
+// Operator handle for the random.from_out schema: const self, int64_t from, optional int64_t to, optional generator, and a trailing at::Tensor & out; returns at::Tensor &. call and redispatch are only declared here.
+struct TORCH_API random_from_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, c10::optional<int64_t>, c10::optional<at::Generator>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "from_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random.from_out(Tensor self, int from, int? to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator, at::Tensor & out);
+};
+// Operator handle for the random.from schema: const self, int64_t from, optional int64_t to, optional generator; returns at::Tensor by value. call and redispatch are only declared here.
+struct TORCH_API random_from {
+  using schema = at::Tensor (const at::Tensor &, int64_t, c10::optional<int64_t>, c10::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "from")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random.from(Tensor self, int from, int? to, *, Generator? generator=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t from, c10::optional<int64_t> to, c10::optional<at::Generator> generator);
+};
+// Operator handle for the random.to_out schema: const self, int64_t to, optional generator, and a trailing at::Tensor & out; returns at::Tensor &. call and redispatch are only declared here.
+struct TORCH_API random_to_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, c10::optional<at::Generator>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "to_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random.to_out(Tensor self, int to, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t to, c10::optional<at::Generator> generator, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t to, c10::optional<at::Generator> generator, at::Tensor & out);
+};
+// Operator handle for the random.to schema: const self, int64_t to, optional generator; returns at::Tensor by value. call and redispatch are only declared here.
+struct TORCH_API random_to {
+  using schema = at::Tensor (const at::Tensor &, int64_t, c10::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "to")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random.to(Tensor self, int to, *, Generator? generator=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t to, c10::optional<at::Generator> generator);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t to, c10::optional<at::Generator> generator);
+};
+// Operator handle for the random.out schema: const self, optional generator, and a trailing at::Tensor & out; returns at::Tensor &. call and redispatch are only declared here.
+struct TORCH_API random_out {
+  using schema = at::Tensor & (const at::Tensor &, c10::optional<at::Generator>, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random.out(Tensor self, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, c10::optional<at::Generator> generator, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator, at::Tensor & out);
+};
+// Operator handle for the random schema with an empty overload name: const self and an optional generator; returns at::Tensor by value. call and redispatch are only declared here.
+struct TORCH_API random {
+  using schema = at::Tensor (const at::Tensor &, c10::optional<at::Generator>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::random")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "random(Tensor self, *, Generator? generator=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, c10::optional<at::Generator> generator);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional<at::Generator> generator);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu6_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu6_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8d1125641da24227830042c5fa010460c47440f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu6_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+// Declares at::native::relu6 (const input, returns at::Tensor by value) and relu6_ (non-const at::Tensor & in, at::Tensor & returned). Declarations only.
+namespace at {
+namespace native {
+TORCH_API at::Tensor relu6(const at::Tensor & self);
+TORCH_API at::Tensor & relu6_(at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..a252ee3ef0bd969bdf20e38fc5c4258836ed02f7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+// Operator handle for the replication_pad1d.out schema: const self, a c10::SymIntArrayRef padding (SymInt[2] in the schema string), and at::Tensor & out; returns at::Tensor &. call and redispatch are only declared here.
+struct TORCH_API replication_pad1d_out {
+  using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::replication_pad1d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out);
+};
+// Operator handle for the replication_pad1d schema with an empty overload name: const self and a c10::SymIntArrayRef padding (SymInt[2] in the schema string); returns at::Tensor by value. call and redispatch are only declared here.
+struct TORCH_API replication_pad1d {
+  using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::replication_pad1d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef padding);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef padding);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad2d_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad2d_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4465aa774fce171038b5677c963def97ae61a5bf
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad2d_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations only: replication_pad2d plus its out/outf forms in namespace at::cuda, each with an at::IntArrayRef padding flavour and a *_symint flavour taking c10::SymIntArrayRef.
+namespace cuda {
+// The *_out forms take `out` as the first parameter, the *_outf forms take it as the last; all four return at::Tensor &.
+TORCH_API at::Tensor replication_pad2d(const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor replication_pad2d_symint(const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & replication_pad2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor & replication_pad2d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out);
+TORCH_API at::Tensor & replication_pad2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & replication_pad2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/round_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/round_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..07695faecbd2b71bf922674e1db7ad923a29bc02
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/round_native.h
@@ -0,0 +1,32 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/round_meta.h>
+// Native declarations for round: two structs deriving from at::meta::structured_round / structured_round_decimals (presumably provided by the round_meta.h include above) that each declare impl(), plus functional, out and in-place functions for the sparse and sparse_csr variants.
+namespace at {
+namespace native {
+struct TORCH_API structured_round_out : public at::meta::structured_round {
+void impl(const at::Tensor & self, const at::Tensor & out);
+};
+TORCH_API at::Tensor round_sparse(const at::Tensor & self);
+TORCH_API at::Tensor & round_sparse_out(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & round_sparse_(at::Tensor & self);
+TORCH_API at::Tensor round_sparse_csr(const at::Tensor & self);
+TORCH_API at::Tensor & round_sparse_csr_out(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & round_sparse_csr_(at::Tensor & self);
+struct TORCH_API structured_round_decimals_out : public at::meta::structured_round_decimals {
+void impl(const at::Tensor & self, int64_t decimals, const at::Tensor & out);
+};
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/searchsorted_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/searchsorted_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..40811b480432312356575c1d610d74cda950993b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/searchsorted_cuda_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations only: two searchsorted overload sets in namespace at::cuda, distinguished by the type of `self` -- const at::Tensor & (first three) and const at::Scalar & (last three).
+namespace cuda {
+// The functional and *_out forms default out_int32=false, right=false, side=c10::nullopt and sorter={}; the *_outf forms have no defaults and take `out` last.
+TORCH_API at::Tensor searchsorted(const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32=false, bool right=false, c10::optional<c10::string_view> side=c10::nullopt, const c10::optional<at::Tensor> & sorter={});
+TORCH_API at::Tensor & searchsorted_out(at::Tensor & out, const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32=false, bool right=false, c10::optional<c10::string_view> side=c10::nullopt, const c10::optional<at::Tensor> & sorter={});
+TORCH_API at::Tensor & searchsorted_outf(const at::Tensor & sorted_sequence, const at::Tensor & self, bool out_int32, bool right, c10::optional<c10::string_view> side, const c10::optional<at::Tensor> & sorter, at::Tensor & out);
+TORCH_API at::Tensor searchsorted(const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32=false, bool right=false, c10::optional<c10::string_view> side=c10::nullopt, const c10::optional<at::Tensor> & sorter={});
+TORCH_API at::Tensor & searchsorted_out(at::Tensor & out, const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32=false, bool right=false, c10::optional<c10::string_view> side=c10::nullopt, const c10::optional<at::Tensor> & sorter={});
+TORCH_API at::Tensor & searchsorted_outf(const at::Tensor & sorted_sequence, const at::Scalar & self, bool out_int32, bool right, c10::optional<c10::string_view> side, const c10::optional<at::Tensor> & sorter, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sgn.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sgn.h
new file mode 100644
index 0000000000000000000000000000000000000000..9365ebd7cce663cf20593a674a455c6e0008e848
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sgn.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/sgn_ops.h>
+
+namespace at {
+// Inline C++ wrappers for aten::sgn; every wrapper is a single forwarding call into at::_ops (declared in sgn_ops.h, included above).
+
+// aten::sgn(Tensor self) -> Tensor
+inline at::Tensor sgn(const at::Tensor & self) {
+    return at::_ops::sgn::call(self);
+}
+
+// aten::sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)  [`out` is the first parameter here and is passed last to sgn_out::call]
+inline at::Tensor & sgn_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::sgn_out::call(self, out);
+}
+// aten::sgn.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)  [same call as sgn_out, but with `out` as the last parameter]
+inline at::Tensor & sgn_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::sgn_out::call(self, out);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/soft_margin_loss_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/soft_margin_loss_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..6044e0969ba6848205c5ea7813907ac565c76c64
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/soft_margin_loss_backward_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Operator descriptor structs for aten::soft_margin_loss_backward. Each bundles the C++ signature type (`schema`), the name/overload_name/schema_str string constants, and the static `call`/`redispatch` entry points (declared here, defined elsewhere).
+// grad_input overload: writes into the caller-supplied `grad_input` tensor reference and returns an at::Tensor&. `redispatch` has the same parameters as `call` plus a leading c10::DispatchKeySet.
+struct TORCH_API soft_margin_loss_backward_grad_input {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::soft_margin_loss_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "grad_input")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "soft_margin_loss_backward.grad_input(Tensor grad_output, Tensor self, Tensor target, int reduction, *, Tensor(a!) grad_input) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & grad_input);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, at::Tensor & grad_input);
+};
+// Default overload (empty overload_name): same inputs without the `grad_input` parameter; returns an at::Tensor by value.
+struct TORCH_API soft_margin_loss_backward {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::soft_margin_loss_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "soft_margin_loss_backward(Tensor grad_output, Tensor self, Tensor target, int reduction) -> Tensor")
+  static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softshrink_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softshrink_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e208d5f4b4c2664bf3b4355299f08a03b7d470d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softshrink_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Operator descriptor structs for aten::softshrink. Each bundles the C++ signature type (`schema`), the name/overload_name/schema_str string constants, and the static `call`/`redispatch` entry points (declared here, defined elsewhere).
+// out overload: takes the destination tensor as the last parameter and returns an at::Tensor&. Note the `lambd=0.5` default appears only in schema_str; the C++ declarations below have no default arguments.
+struct TORCH_API softshrink_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::softshrink")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "softshrink.out(Tensor self, Scalar lambd=0.5, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, const at::Scalar & lambd, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & lambd, at::Tensor & out);
+};
+// Default overload (empty overload_name): returns an at::Tensor by value; `redispatch` adds a leading c10::DispatchKeySet parameter.
+struct TORCH_API softshrink {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::softshrink")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "softshrink(Tensor self, Scalar lambd=0.5) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Scalar & lambd);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Scalar & lambd);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_i0e_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_i0e_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..eed4e1cff11d87ac33d0b6eb36bddfa644c45f31
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_i0e_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations only; per the note at the top of this header the implementing file is RegisterDispatchKey.cpp.
+namespace cuda {
+// special_i0e returns a Tensor by value; `_out` takes `out` first and `_outf` takes `out` last, both returning an at::Tensor&.
+TORCH_API at::Tensor special_i0e(const at::Tensor & self);
+TORCH_API at::Tensor & special_i0e_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_i0e_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..04aedbe51b8f90414698254408cc241fcd796460
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_laguerre_polynomial_l_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations only; per the note at the top of this header the implementing file is RegisterDispatchKey.cpp.
+namespace cuda {
+// All three take Tensor `x` and Tensor `n`; `_out` takes `out` first and `_outf` takes `out` last, both returning an at::Tensor&.
+TORCH_API at::Tensor special_laguerre_polynomial_l(const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_laguerre_polynomial_l_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n);
+TORCH_API at::Tensor & special_laguerre_polynomial_l_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_log1p.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_log1p.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d83f91b0e0d7a56ec1ec5f7caa5940d9f8ebd7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_log1p.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/special_log1p_ops.h>
+
+namespace at {
+// Inline C++ wrappers for aten::special_log1p; every wrapper is a single forwarding call into at::_ops (declared in special_log1p_ops.h, included above).
+
+// aten::special_log1p(Tensor self) -> Tensor
+inline at::Tensor special_log1p(const at::Tensor & self) {
+    return at::_ops::special_log1p::call(self);
+}
+
+// aten::special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)  [`out` is the first parameter here and is passed last to special_log1p_out::call]
+inline at::Tensor & special_log1p_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::special_log1p_out::call(self, out);
+}
+// aten::special_log1p.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)  [same call as special_log1p_out, but with `out` as the last parameter]
+inline at::Tensor & special_log1p_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::special_log1p_out::call(self, out);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f9f33d83457601d3b2f9c2fd46d4efa8812d386
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_shifted_chebyshev_polynomial_v_ops.h
@@ -0,0 +1,83 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Operator descriptor structs for aten::special_shifted_chebyshev_polynomial_v, one per overload. Each bundles the C++ signature type (`schema`), the name/overload_name/schema_str string constants, and the static `call`/`redispatch` entry points (declared here, defined elsewhere).
+// Default overload (empty overload_name): Tensor `x`, Tensor `n`; returns an at::Tensor by value. In every struct `redispatch` mirrors `call` with a leading c10::DispatchKeySet.
+struct TORCH_API special_shifted_chebyshev_polynomial_v {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_shifted_chebyshev_polynomial_v")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_shifted_chebyshev_polynomial_v(Tensor x, Tensor n) -> Tensor")
+  static at::Tensor call(const at::Tensor & x, const at::Tensor & n);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n);
+};
+// x_scalar overload: `x` is an at::Scalar, `n` is a Tensor; returns an at::Tensor by value.
+struct TORCH_API special_shifted_chebyshev_polynomial_v_x_scalar {
+  using schema = at::Tensor (const at::Scalar &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_shifted_chebyshev_polynomial_v")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "x_scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_shifted_chebyshev_polynomial_v.x_scalar(Scalar x, Tensor n) -> Tensor")
+  static at::Tensor call(const at::Scalar & x, const at::Tensor & n);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n);
+};
+// n_scalar overload: `x` is a Tensor, `n` is an at::Scalar; returns an at::Tensor by value.
+struct TORCH_API special_shifted_chebyshev_polynomial_v_n_scalar {
+  using schema = at::Tensor (const at::Tensor &, const at::Scalar &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_shifted_chebyshev_polynomial_v")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "n_scalar")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_shifted_chebyshev_polynomial_v.n_scalar(Tensor x, Scalar n) -> Tensor")
+  static at::Tensor call(const at::Tensor & x, const at::Scalar & n);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n);
+};
+// out overload: Tensor `x`, Tensor `n`, plus a trailing `out` tensor reference; returns an at::Tensor&.
+struct TORCH_API special_shifted_chebyshev_polynomial_v_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_shifted_chebyshev_polynomial_v")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_shifted_chebyshev_polynomial_v.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Tensor & n, at::Tensor & out);
+};
+// x_scalar_out overload: Scalar `x`, Tensor `n`, plus a trailing `out` tensor reference; returns an at::Tensor&.
+struct TORCH_API special_shifted_chebyshev_polynomial_v_x_scalar_out {
+  using schema = at::Tensor & (const at::Scalar &, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_shifted_chebyshev_polynomial_v")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "x_scalar_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_shifted_chebyshev_polynomial_v.x_scalar_out(Scalar x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Scalar & x, const at::Tensor & n, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Scalar & x, const at::Tensor & n, at::Tensor & out);
+};
+// n_scalar_out overload: Tensor `x`, Scalar `n`, plus a trailing `out` tensor reference; returns an at::Tensor&.
+struct TORCH_API special_shifted_chebyshev_polynomial_v_n_scalar_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Scalar &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::special_shifted_chebyshev_polynomial_v")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "n_scalar_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "special_shifted_chebyshev_polynomial_v.n_scalar_out(Tensor x, Scalar n, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & x, const at::Scalar & n, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & x, const at::Scalar & n, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_storage_offset_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_storage_offset_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..56bb86216b37b67225000ae99e652919f75b38bf
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_storage_offset_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declaration only; per the note at the top of this header the implementing file is RegisterDispatchKey.cpp.
+namespace compositeimplicitautograd {
+// Takes a const Tensor reference and returns a c10::SymInt by value; there is no out variant in this header.
+TORCH_API c10::SymInt sym_storage_offset(const at::Tensor & self);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_padded_tensor_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_padded_tensor_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a3092ae8356d2432db503e2f00d9602db117113
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_padded_tensor_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at { // Declarations only (no bodies in this header) for the to_padded_tensor functions in at::native.
+namespace native { // The _out_symint form takes an OptionalSymIntArrayRef and a trailing `out` with no defaults; the NestedTensor_* forms take an OptionalIntArrayRef defaulting to c10::nullopt.
+TORCH_API at::Tensor & to_padded_tensor_out_symint(const at::Tensor & self, double padding, at::OptionalSymIntArrayRef output_size, at::Tensor & out);
+TORCH_API at::Tensor NestedTensor_to_padded_tensor_generic(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=c10::nullopt);
+TORCH_API at::Tensor NestedTensor_to_padded_tensor_cuda(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=c10::nullopt);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unbind_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unbind_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8df8b56215729c8c08fb798eb5d8ec0a36d1e72
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unbind_native.h
@@ -0,0 +1,23 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at { // Declarations only (no bodies in this header) for the unbind functions in at::native.
+namespace native { // Every function returns ::std::vector<at::Tensor>; `unbind` is overloaded on an int64_t dim (default 0) and an at::Dimname dim, and NestedTensor_unbind takes an int64_t dim (default 0).
+TORCH_API ::std::vector<at::Tensor> unbind(const at::Tensor & self, int64_t dim=0);
+TORCH_API ::std::vector<at::Tensor> NestedTensor_unbind(const at::Tensor & self, int64_t dim=0);
+TORCH_API ::std::vector<at::Tensor> unbind(const at::Tensor & self, at::Dimname dim);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/uniform.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/uniform.h
new file mode 100644
index 0000000000000000000000000000000000000000..4d44ffd4e3f77ed411ceb4a134e6431e118893eb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/uniform.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/uniform_ops.h>
+
+namespace at {
+// Inline C++ wrappers for aten::uniform; every wrapper is a single forwarding call into at::_ops (declared in uniform_ops.h, included above).
+
+// aten::uniform.out(Tensor self, float from=0, float to=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)  [`out` is the first parameter here and is passed last to uniform_out::call; from/to/generator keep their defaults]
+inline at::Tensor & uniform_out(at::Tensor & out, const at::Tensor & self, double from=0, double to=1, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::uniform_out::call(self, from, to, generator, out);
+}
+// aten::uniform.out(Tensor self, float from=0, float to=1, *, Generator? generator=None, Tensor(a!) out) -> Tensor(a!)  [same call as uniform_out, but `out` is the last parameter and no argument has a default]
+inline at::Tensor & uniform_outf(const at::Tensor & self, double from, double to, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::uniform_out::call(self, from, to, generator, out);
+}
+
+// aten::uniform(Tensor self, float from=0, float to=1, *, Generator? generator=None) -> Tensor
+inline at::Tensor uniform(const at::Tensor & self, double from=0, double to=1, c10::optional<at::Generator> generator=c10::nullopt) {
+    return at::_ops::uniform::call(self, from, to, generator);
+}
+
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/var_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/var_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..020cf2d82415b1e803515e6b04855878fbf60833
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/var_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+// Declarations only; per the note at the top of this header the implementing file is RegisterDispatchKey.cpp.
+namespace cuda {
+// `var` and `var_out` default dim and correction to c10::nullopt and keepdim to false; `var_outf` has no defaults and takes `out` last, whereas `var_out` takes `out` first.
+TORCH_API at::Tensor var(const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false);
+TORCH_API at::Tensor & var_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional<at::Scalar> & correction=c10::nullopt, bool keepdim=false);
+TORCH_API at::Tensor & var_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, const c10::optional<at::Scalar> & correction, bool keepdim, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/vsplit_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/vsplit_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d57f3af727f14532725f0d3f9031ecf04a16bf1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/vsplit_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+// Operator descriptor structs for aten::vsplit, one per overload. Each bundles the C++ signature type (`schema`), the name/overload_name/schema_str string constants, and the static `call`/`redispatch` entry points (declared here, defined elsewhere).
+// int overload: takes an int64_t `sections` and returns ::std::vector<at::Tensor>. `redispatch` mirrors `call` with a leading c10::DispatchKeySet.
+struct TORCH_API vsplit_int {
+  using schema = ::std::vector<at::Tensor> (const at::Tensor &, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::vsplit")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "int")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "vsplit.int(Tensor(a -> *) self, int sections) -> Tensor(a)[]")
+  static ::std::vector<at::Tensor> call(const at::Tensor & self, int64_t sections);
+  static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t sections);
+};
+// array overload: takes an at::IntArrayRef `indices` instead of a section count; same return type as the int overload.
+struct TORCH_API vsplit_array {
+  using schema = ::std::vector<at::Tensor> (const at::Tensor &, at::IntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::vsplit")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "array")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "vsplit.array(Tensor(a -> *) self, int[] indices) -> Tensor(a)[]")
+  static ::std::vector<at::Tensor> call(const at::Tensor & self, at::IntArrayRef indices);
+  static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef indices);
+};
+
+}} // namespace at::_ops