diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_compute_linear_combination_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_compute_linear_combination_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..2fa31ea5ac8f96aab8b02629c6dbf399e5f1fc94
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_compute_linear_combination_cpu_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor _compute_linear_combination(const at::Tensor & input, const at::Tensor & coefficients);
+TORCH_API at::Tensor & _compute_linear_combination_out(at::Tensor & out, const at::Tensor & input, const at::Tensor & coefficients);
+TORCH_API at::Tensor & _compute_linear_combination_outf(const at::Tensor & input, const at::Tensor & coefficients, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_efficient_attention_forward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_efficient_attention_forward.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d03f87bbf52c896742e84bf54d14561a842876f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_efficient_attention_forward.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_efficient_attention_forward_ops.h>
+
+namespace at {
+
+
+// aten::_efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,c10::SymInt,c10::SymInt> _efficient_attention_forward(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & cu_seqlens_q, const c10::optional<at::Tensor> & cu_seqlens_k, c10::optional<int64_t> max_seqlen_q, c10::optional<int64_t> max_seqlen_k, double dropout_p, int64_t custom_mask_type, bool compute_log_sumexp=false, c10::optional<double> scale=c10::nullopt, const c10::optional<at::Tensor> & causal_diagonal={}, const c10::optional<at::Tensor> & seqlen_k={}) {
+    return at::_ops::_efficient_attention_forward::call(query, key, value, bias, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, custom_mask_type, compute_log_sumexp, scale, causal_diagonal, seqlen_k);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..03ecf0db2b7c794b937b1b77c136935fd6e31bbf
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_embedding_bag_sparse_backward_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor _embedding_bag_sparse_backward(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, int64_t num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+TORCH_API at::Tensor _embedding_bag_sparse_backward_symint(const at::Tensor & grad, const at::Tensor & indices, const at::Tensor & offsets, const at::Tensor & offset2bag, const at::Tensor & bag_size, c10::SymInt num_weights, bool scale_grad_by_freq, int64_t mode, const c10::optional<at::Tensor> & per_sample_weights, int64_t padding_idx=-1);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fill_mem_eff_dropout_mask.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fill_mem_eff_dropout_mask.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4fa85ba80c31284c29de21d0beb13197929f4f4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fill_mem_eff_dropout_mask.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_fill_mem_eff_dropout_mask_ops.h>
+
+namespace at {
+
+
+// aten::_fill_mem_eff_dropout_mask_(Tensor(a!) self, float dropout_p, int seed, int offset) -> Tensor(a!)
+inline at::Tensor & _fill_mem_eff_dropout_mask_(at::Tensor & self, double dropout_p, int64_t seed, int64_t offset) {
+    return at::_ops::_fill_mem_eff_dropout_mask_::call(self, dropout_p, seed, offset);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_asin_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_asin_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..75f6798daa999c62a366b0c6077223b216c65a74
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_asin_cuda_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::vector<at::Tensor> _foreach_asin(at::TensorList self);
+TORCH_API void _foreach_asin_(at::TensorList self);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1.h
new file mode 100644
index 0000000000000000000000000000000000000000..b723a48eecc45cc60f64ffe50a763374a52ac971
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_foreach_expm1_ops.h>
+
+namespace at {
+
+
+// aten::_foreach_expm1(Tensor[] self) -> Tensor[]
+inline ::std::vector<at::Tensor> _foreach_expm1(at::TensorList self) {
+    return at::_ops::_foreach_expm1::call(self);
+}
+
+// aten::_foreach_expm1_(Tensor(a!)[] self) -> ()
+inline void _foreach_expm1_(at::TensorList self) {
+    return at::_ops::_foreach_expm1_::call(self);
+}
+
+// aten::_foreach_expm1.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_expm1_out(at::TensorList out, at::TensorList self) {
+    return at::_ops::_foreach_expm1_out::call(self, out);
+}
+// aten::_foreach_expm1.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_expm1_outf(at::TensorList self, at::TensorList out) {
+    return at::_ops::_foreach_expm1_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_frac.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_frac.h
new file mode 100644
index 0000000000000000000000000000000000000000..1419069fce449f2dc8ed797531aef52421524572
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_frac.h
@@ -0,0 +1,44 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_foreach_frac_ops.h>
+
+namespace at {
+
+
+// aten::_foreach_frac(Tensor[] self) -> Tensor[]
+inline ::std::vector<at::Tensor> _foreach_frac(at::TensorList self) {
+    return at::_ops::_foreach_frac::call(self);
+}
+
+// aten::_foreach_frac_(Tensor(a!)[] self) -> ()
+inline void _foreach_frac_(at::TensorList self) {
+    return at::_ops::_foreach_frac_::call(self);
+}
+
+// aten::_foreach_frac.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_frac_out(at::TensorList out, at::TensorList self) {
+    return at::_ops::_foreach_frac_out::call(self, out);
+}
+// aten::_foreach_frac.out(Tensor[] self, *, Tensor(a!)[] out) -> ()
+inline void _foreach_frac_outf(at::TensorList self, at::TensorList out) {
+    return at::_ops::_foreach_frac_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lazy_clone_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lazy_clone_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed6be2b9c723b0f88641b8f7012e92e58a9a2b66
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_lazy_clone_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor _lazy_clone(const at::Tensor & self);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_local_scalar_dense_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_local_scalar_dense_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..f9a174c3b81808c1fc694527fcc85a3a229334c9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_local_scalar_dense_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Scalar _local_scalar_dense_cpu(const at::Tensor & self);
+TORCH_API at::Scalar _local_scalar_dense_cuda(const at::Tensor & self);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_pin_memory.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_pin_memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3c1602631c262bf471ca47783b463eb9db9fea5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_pin_memory.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_pin_memory_ops.h>
+
+namespace at {
+
+
+// aten::_pin_memory(Tensor self, Device? device=None) -> Tensor
+inline at::Tensor _pin_memory(const at::Tensor & self, c10::optional<at::Device> device=c10::nullopt) {
+    return at::_ops::_pin_memory::call(self, device);
+}
+
+// aten::_pin_memory.out(Tensor self, Device? device=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _pin_memory_out(at::Tensor & out, const at::Tensor & self, c10::optional<at::Device> device=c10::nullopt) {
+    return at::_ops::_pin_memory_out::call(self, device, out);
+}
+// aten::_pin_memory.out(Tensor self, Device? device=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _pin_memory_outf(const at::Tensor & self, c10::optional<at::Device> device, at::Tensor & out) {
+    return at::_ops::_pin_memory_out::call(self, device, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c717b33ae016d389af89a194db7c6e686c929aa
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_compressed_tensor_unsafe_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _sparse_compressed_tensor_unsafe {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, c10::SymIntArrayRef, c10::optional<at::ScalarType>, c10::optional<at::Layout>, c10::optional<at::Device>, c10::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_compressed_tensor_unsafe")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_compressed_tensor_unsafe(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & compressed_indices, const at::Tensor & plain_indices, const at::Tensor & values, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_unsafe_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_unsafe_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a18517b2ea77a857284aa80c2635fc9185ba855
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_coo_tensor_unsafe_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, at::TensorOptions options={}, c10::optional<bool> is_coalesced=c10::nullopt);
+TORCH_API at::Tensor _sparse_coo_tensor_unsafe(const at::Tensor & indices, const at::Tensor & values, at::IntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced);
+TORCH_API at::Tensor _sparse_coo_tensor_unsafe_symint(const at::Tensor & indices, const at::Tensor & values, c10::SymIntArrayRef size, at::TensorOptions options={}, c10::optional<bool> is_coalesced=c10::nullopt);
+TORCH_API at::Tensor _sparse_coo_tensor_unsafe_symint(const at::Tensor & indices, const at::Tensor & values, c10::SymIntArrayRef size, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<bool> is_coalesced);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..f901a30fc1874fd5fcf17851fbc844fa630bb605
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API _test_autograd_multiple_dispatch_view {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view(Tensor(a) self) -> Tensor(a)")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_optional_intlist.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_optional_intlist.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c4b552819269f0f43fc7ecc3787c2c1ebad5f49
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_optional_intlist.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_test_optional_intlist_ops.h>
+
+namespace at {
+
+
+// aten::_test_optional_intlist(Tensor values, int[]? addends) -> Tensor
+inline at::Tensor _test_optional_intlist(const at::Tensor & values, at::OptionalIntArrayRef addends) {
+    return at::_ops::_test_optional_intlist::call(values, addends);
+}
+
+// aten::_test_optional_intlist.out(Tensor values, int[]? addends, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _test_optional_intlist_out(at::Tensor & out, const at::Tensor & values, at::OptionalIntArrayRef addends) {
+    return at::_ops::_test_optional_intlist_out::call(values, addends, out);
+}
+// aten::_test_optional_intlist.out(Tensor values, int[]? addends, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _test_optional_intlist_outf(const at::Tensor & values, at::OptionalIntArrayRef addends, at::Tensor & out) {
+    return at::_ops::_test_optional_intlist_out::call(values, addends, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1955fcea4b6bcdba2301e7fa1f689e2fd6b55fa7
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_thnn_fused_gru_cell_cuda_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> _thnn_fused_gru_cell(const at::Tensor & input_gates, const at::Tensor & hidden_gates, const at::Tensor & hx, const c10::optional<at::Tensor> & input_bias={}, const c10::optional<at::Tensor> & hidden_bias={});
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_index_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_index_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..92864dba090520827771b7a512a607d6f5fc3afd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_index_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor _unsafe_index(const at::Tensor & self, const c10::List<c10::optional<at::Tensor>> & indices);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e29f62e42bfd3a2cc7421ffbad6474a8c8efcf4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_bilinear2d_aa_backward.h
@@ -0,0 +1,91 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_upsample_bilinear2d_aa_backward_ops.h>
+
+namespace at {
+
+
+// aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & _upsample_bilinear2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & _upsample_bilinear2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & _upsample_bilinear2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+    return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & _upsample_bilinear2d_aa_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+    return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w, grad_input);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & _upsample_bilinear2d_aa_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & _upsample_bilinear2d_aa_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa_backward.grad_input(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None, *, Tensor(a!) grad_input) -> Tensor(a!)
+inline at::Tensor & _upsample_bilinear2d_aa_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+    return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & _upsample_bilinear2d_aa_backward_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & grad_input) {
+    return at::_ops::_upsample_bilinear2d_aa_backward_grad_input::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w, grad_input);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+inline at::Tensor _upsample_bilinear2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor _upsample_bilinear2d_aa_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_backward::call(grad_output, c10::fromIntArrayRefSlow(output_size), c10::fromIntArrayRefSlow(input_size), align_corners, scales_h, scales_w);
+  }
+}
+
+// aten::_upsample_bilinear2d_aa_backward(Tensor grad_output, SymInt[2] output_size, SymInt[4] input_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
+inline at::Tensor _upsample_bilinear2d_aa_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor _upsample_bilinear2d_aa_backward(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_bilinear2d_aa_backward::call(grad_output, output_size, input_size, align_corners, scales_h, scales_w);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h
new file mode 100644
index 0000000000000000000000000000000000000000..148b4436fb0486cf7f166717b86073a50450d342
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_nearest_exact3d.h
@@ -0,0 +1,113 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/_upsample_nearest_exact3d_ops.h>
+
+namespace at {
+
+
+// aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+inline at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, scale_factors);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size.has_value() ? c10::make_optional(c10::fromIntArrayRefSlow(*output_size)) : c10::nullopt, scale_factors);
+  }
+}
+
+// aten::_upsample_nearest_exact3d.vec(Tensor input, SymInt[]? output_size, float[]? scale_factors) -> Tensor
+inline at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size, scale_factors);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor _upsample_nearest_exact3d(const at::Tensor & input, at::OptionalSymIntArrayRef output_size, c10::optional<at::ArrayRef<double>> scale_factors) {
+    return at::_ops::_upsample_nearest_exact3d_vec::call(input, output_size, scale_factors);
+  }
+}
+
+// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+  }
+}
+
+// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w, out);
+  }
+}
+
+// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_nearest_exact3d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & _upsample_nearest_exact3d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out);
+  }
+}
+
+// aten::_upsample_nearest_exact3d.out(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _upsample_nearest_exact3d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & _upsample_nearest_exact3d_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d, c10::optional<double> scales_h, c10::optional<double> scales_w, at::Tensor & out) {
+    return at::_ops::_upsample_nearest_exact3d_out::call(self, output_size, scales_d, scales_h, scales_w, out);
+  }
+}
+
+// aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+inline at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, at::IntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d::call(self, c10::fromIntArrayRefSlow(output_size), scales_d, scales_h, scales_w);
+  }
+}
+
+// aten::_upsample_nearest_exact3d(Tensor self, SymInt[3] output_size, float? scales_d=None, float? scales_h=None, float? scales_w=None) -> Tensor
+inline at::Tensor _upsample_nearest_exact3d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d::call(self, output_size, scales_d, scales_h, scales_w);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor _upsample_nearest_exact3d(const at::Tensor & self, c10::SymIntArrayRef output_size, c10::optional<double> scales_d=c10::nullopt, c10::optional<double> scales_h=c10::nullopt, c10::optional<double> scales_w=c10::nullopt) {
+    return at::_ops::_upsample_nearest_exact3d::call(self, output_size, scales_d, scales_h, scales_w);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/align_tensors_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/align_tensors_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffce8f8fa2733cd380611b0fe3d5333d61e90f0e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/align_tensors_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API ::std::vector<at::Tensor> align_tensors(at::TensorList tensors);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/any_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/any_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..de16dd8897a2eebbf5532d8acb9ae214b22370d0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/any_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor any(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false);
+TORCH_API at::Tensor & any_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false);
+TORCH_API at::Tensor & any_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_right_shift.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_right_shift.h
new file mode 100644
index 0000000000000000000000000000000000000000..396ddf7dbd77d3729204b0ae63297fc14be32f08
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/bitwise_right_shift.h
@@ -0,0 +1,67 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/bitwise_right_shift_ops.h>
+
+namespace at {
+
+
+// aten::bitwise_right_shift.Tensor(Tensor self, Tensor other) -> Tensor
+inline at::Tensor bitwise_right_shift(const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::bitwise_right_shift_Tensor::call(self, other);
+}
+
+// aten::bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_right_shift_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) {
+    return at::_ops::bitwise_right_shift_Tensor_out::call(self, other, out);
+}
+// aten::bitwise_right_shift.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_right_shift_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) {
+    return at::_ops::bitwise_right_shift_Tensor_out::call(self, other, out);
+}
+
+// aten::bitwise_right_shift.Tensor_Scalar(Tensor self, Scalar other) -> Tensor
+inline at::Tensor bitwise_right_shift(const at::Tensor & self, const at::Scalar & other) {
+    return at::_ops::bitwise_right_shift_Tensor_Scalar::call(self, other);
+}
+
+// aten::bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_right_shift_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other) {
+    return at::_ops::bitwise_right_shift_Tensor_Scalar_out::call(self, other, out);
+}
+// aten::bitwise_right_shift.Tensor_Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_right_shift_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out) {
+    return at::_ops::bitwise_right_shift_Tensor_Scalar_out::call(self, other, out);
+}
+
+// aten::bitwise_right_shift.Scalar_Tensor(Scalar self, Tensor other) -> Tensor
+inline at::Tensor bitwise_right_shift(const at::Scalar & self, const at::Tensor & other) {
+    return at::_ops::bitwise_right_shift_Scalar_Tensor::call(self, other);
+}
+
+// aten::bitwise_right_shift.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_right_shift_out(at::Tensor & out, const at::Scalar & self, const at::Tensor & other) {
+    return at::_ops::bitwise_right_shift_Scalar_Tensor_out::call(self, other, out);
+}
+// aten::bitwise_right_shift.Scalar_Tensor_out(Scalar self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & bitwise_right_shift_outf(const at::Scalar & self, const at::Tensor & other, at::Tensor & out) {
+    return at::_ops::bitwise_right_shift_Scalar_Tensor_out::call(self, other, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/crow_indices_copy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/crow_indices_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6ad76ecb1bb264b0501b533921867b25e6402e1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/crow_indices_copy.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/crow_indices_copy_ops.h>
+
+namespace at {
+
+
+// aten::crow_indices_copy(Tensor self) -> Tensor
+inline at::Tensor crow_indices_copy(const at::Tensor & self) {
+    return at::_ops::crow_indices_copy::call(self);
+}
+
+// aten::crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & crow_indices_copy_out(at::Tensor & out, const at::Tensor & self) {
+    return at::_ops::crow_indices_copy_out::call(self, out);
+}
+// aten::crow_indices_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & crow_indices_copy_outf(const at::Tensor & self, at::Tensor & out) {
+    return at::_ops::crow_indices_copy_out::call(self, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..b38b193442fe81f2e610a605ac0dc1a6ff680ef5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_backward_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API cudnn_grid_sampler_backward {
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::cudnn_grid_sampler_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "cudnn_grid_sampler_backward(Tensor self, Tensor grid, Tensor grad_output) -> (Tensor grad_self, Tensor grad_grid)")
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & self, const at::Tensor & grid, const at::Tensor & grad_output);
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grid, const at::Tensor & grad_output);
+};
+
+struct TORCH_API cudnn_grid_sampler_backward_out {
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &> (const at::Tensor &, const at::Tensor &, const at::Tensor &, at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::cudnn_grid_sampler_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "cudnn_grid_sampler_backward.out(Tensor self, Tensor grid, Tensor grad_output, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))")
+  static ::std::tuple<at::Tensor &,at::Tensor &> call(const at::Tensor & self, const at::Tensor & grid, const at::Tensor & grad_output, at::Tensor & out0, at::Tensor & out1);
+  static ::std::tuple<at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & grid, const at::Tensor & grad_output, at::Tensor & out0, at::Tensor & out1);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagflat.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagflat.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5c1b570f012a0278738dcf3ed16670c07768ccc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diagflat.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/diagflat_ops.h>
+
+namespace at {
+
+
+// aten::diagflat(Tensor self, int offset=0) -> Tensor
+inline at::Tensor diagflat(const at::Tensor & self, int64_t offset=0) {
+    return at::_ops::diagflat::call(self, offset);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diff_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diff_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..5de0601dbddaf1f443cc2aa700a6c0efa6065b02
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diff_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API diff {
+  using schema = at::Tensor (const at::Tensor &, int64_t, int64_t, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::diff")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "diff(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, int64_t n, int64_t dim, const c10::optional<at::Tensor> & prepend, const c10::optional<at::Tensor> & append);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n, int64_t dim, const c10::optional<at::Tensor> & prepend, const c10::optional<at::Tensor> & append);
+};
+
+struct TORCH_API diff_out {
+  using schema = at::Tensor & (const at::Tensor &, int64_t, int64_t, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::diff")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "diff.out(Tensor self, int n=1, int dim=-1, Tensor? prepend=None, Tensor? append=None, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, int64_t n, int64_t dim, const c10::optional<at::Tensor> & prepend, const c10::optional<at::Tensor> & append, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t n, int64_t dim, const c10::optional<at::Tensor> & prepend, const c10::optional<at::Tensor> & append, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/erfinv_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/erfinv_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..3fd889fd414428b99f57b4d39a649bc6fc760e8c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/erfinv_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API erfinv {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::erfinv")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "erfinv(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API erfinv_ {
+  using schema = at::Tensor & (at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::erfinv_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "erfinv_(Tensor(a!) self) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);
+};
+
+struct TORCH_API erfinv_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::erfinv")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "erfinv.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expand_copy_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expand_copy_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f8a272e4747a3ebcf9c90b41fbde716ae8ddad9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/expand_copy_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & expand_copy_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, bool implicit=false);
+TORCH_API at::Tensor & expand_copy_outf(const at::Tensor & self, at::IntArrayRef size, bool implicit, at::Tensor & out);
+TORCH_API at::Tensor & expand_copy_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, bool implicit=false);
+TORCH_API at::Tensor & expand_copy_symint_outf(const at::Tensor & self, c10::SymIntArrayRef size, bool implicit, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flip_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flip_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6b4164348e2f9dbbe646eea0abb0d17724e657a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/flip_cpu_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor flip(const at::Tensor & self, at::IntArrayRef dims);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fmin_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fmin_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..eda5c345f10a9c5bb46a14776d6dd40805928803
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fmin_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor fmin(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & fmin_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & fmin_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frac_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frac_meta_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..48ceec3d83c5563c6075c3c55e913287a091e9cf
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frac_meta_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace meta {
+
+TORCH_API at::Tensor frac(const at::Tensor & self);
+TORCH_API at::Tensor & frac_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & frac_outf(const at::Tensor & self, at::Tensor & out);
+TORCH_API at::Tensor & frac_(at::Tensor & self);
+
+} // namespace meta
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like.h
new file mode 100644
index 0000000000000000000000000000000000000000..dae1cf318485faa543fc2f2ace30f1c27b2945d1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/full_like.h
@@ -0,0 +1,43 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/full_like_ops.h>
+
+namespace at {
+
+
+// aten::full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+inline at::Tensor full_like(const at::Tensor & self, const at::Scalar & fill_value, at::TensorOptions options={}, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+    return at::_ops::full_like::call(self, fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format));
+}
+// aten::full_like(Tensor self, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? memory_format=None) -> Tensor
+inline at::Tensor full_like(const at::Tensor & self, const at::Scalar & fill_value, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory, c10::optional<at::MemoryFormat> memory_format) {
+    return at::_ops::full_like::call(self, fill_value, dtype, layout, device, pin_memory, memory_format);
+}
+
+// aten::full_like.out(Tensor self, Scalar fill_value, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & full_like_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & fill_value, c10::optional<at::MemoryFormat> memory_format=c10::nullopt) {
+    return at::_ops::full_like_out::call(self, fill_value, memory_format, out);
+}
+// aten::full_like.out(Tensor self, Scalar fill_value, *, MemoryFormat? memory_format=None, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & full_like_outf(const at::Tensor & self, const at::Scalar & fill_value, c10::optional<at::MemoryFormat> memory_format, at::Tensor & out) {
+    return at::_ops::full_like_out::call(self, fill_value, memory_format, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gt_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gt_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d009f5fd27a763e9d71ff60753ffad6bc0cef92
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gt_meta.h
@@ -0,0 +1,32 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_gt_Scalar : public TensorIteratorBase {
+    
+    
+    void meta(const at::Tensor & self, const at::Scalar & other);
+};
+struct TORCH_API structured_gt_Tensor : public TensorIteratorBase {
+    
+    
+    void meta(const at::Tensor & self, const at::Tensor & other);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/huber_loss_backward_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/huber_loss_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0815cd96efab939c37dd08122e79caa8f282ec1f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/huber_loss_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor huber_loss_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, int64_t reduction, double delta);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_copy_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_copy_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5e3f18a2b5f64b63dc3f2b238d883fb22859d82
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_copy_meta.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_index_copy : public at::impl::MetaBase {
+    
+                template <bool DIM = false>
+                struct TORCH_API precompute_out {
+                    
+                    precompute_out<true> set_dim(int64_t value) {
+                        static_assert(DIM == false, "dim already set");
+                        precompute_out<true> ret;
+ret.dim = value;
+return ret;
+                    }
+                
+                    int64_t dim;
+            };
+    using meta_return_ty = precompute_out <true>;
+    meta_return_ty meta(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_reduce_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_reduce_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f0b9444b30a113e3d8df143d751a21797d6283b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_reduce_cuda_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor index_reduce(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self=true);
+TORCH_API at::Tensor & index_reduce_out(at::Tensor & out, const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self=true);
+TORCH_API at::Tensor & index_reduce_outf(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self, at::Tensor & out);
+TORCH_API at::Tensor & index_reduce_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & source, c10::string_view reduce, bool include_self=true);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/l1_loss_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/l1_loss_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffca09db099a37a84a82c0db3a16d3201fcf7d3b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/l1_loss_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor l1_loss(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_cholesky_ex.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_cholesky_ex.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ea30657402a13f9ed574062fc3fb1df01494b16
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_cholesky_ex.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/linalg_cholesky_ex_ops.h>
+
+namespace at {
+
+
+// aten::linalg_cholesky_ex(Tensor self, *, bool upper=False, bool check_errors=False) -> (Tensor L, Tensor info)
+inline ::std::tuple<at::Tensor,at::Tensor> linalg_cholesky_ex(const at::Tensor & self, bool upper=false, bool check_errors=false) {
+    return at::_ops::linalg_cholesky_ex::call(self, upper, check_errors);
+}
+
+// aten::linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info)
+inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_cholesky_ex_out(at::Tensor & L, at::Tensor & info, const at::Tensor & self, bool upper=false, bool check_errors=false) {
+    return at::_ops::linalg_cholesky_ex_L::call(self, upper, check_errors, L, info);
+}
+// aten::linalg_cholesky_ex.L(Tensor self, *, bool upper=False, bool check_errors=False, Tensor(a!) L, Tensor(b!) info) -> (Tensor(a!) L, Tensor(b!) info)
+inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_cholesky_ex_outf(const at::Tensor & self, bool upper, bool check_errors, at::Tensor & L, at::Tensor & info) {
+    return at::_ops::linalg_cholesky_ex_L::call(self, upper, check_errors, L, info);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_ex.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_ex.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe0750d5a5d0085fd18b99e94aa2a3b1ebe902dd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_solve_ex.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/linalg_solve_ex_ops.h>
+
+namespace at {
+
+
+// aten::linalg_solve_ex(Tensor A, Tensor B, *, bool left=True, bool check_errors=False) -> (Tensor result, Tensor info)
+inline ::std::tuple<at::Tensor,at::Tensor> linalg_solve_ex(const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false) {
+    return at::_ops::linalg_solve_ex::call(A, B, left, check_errors);
+}
+
+// aten::linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) result, Tensor(b!) info)
+inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_solve_ex_out(at::Tensor & result, at::Tensor & info, const at::Tensor & A, const at::Tensor & B, bool left=true, bool check_errors=false) {
+    return at::_ops::linalg_solve_ex_out::call(A, B, left, check_errors, result, info);
+}
+// aten::linalg_solve_ex.out(Tensor A, Tensor B, *, bool left=True, bool check_errors=False, Tensor(a!) result, Tensor(b!) info) -> (Tensor(a!) result, Tensor(b!) info)
+inline ::std::tuple<at::Tensor &,at::Tensor &> linalg_solve_ex_outf(const at::Tensor & A, const at::Tensor & B, bool left, bool check_errors, at::Tensor & result, at::Tensor & info) {
+    return at::_ops::linalg_solve_ex_out::call(A, B, left, check_errors, result, info);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linear_backward_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linear_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..120c9e4f457c5d5189b7ef1928919af0cd839897
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linear_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linear_backward_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linear_backward_outf(const at::Tensor & self, const at::Tensor & grad_output, const at::Tensor & weight, ::std::array<bool,3> output_mask, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/margin_ranking_loss_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/margin_ranking_loss_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..1fa15f464916f595e51672dae099617e8501b511
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/margin_ranking_loss_ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API margin_ranking_loss {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Tensor &, double, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::margin_ranking_loss")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor")
+  static at::Tensor call(const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & target, double margin, int64_t reduction);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input1, const at::Tensor & input2, const at::Tensor & target, double margin, int64_t reduction);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_select_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_select_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..259a5658f63a1fbb39c0a4964d1bb41e12907413
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_select_native.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor masked_select_cpu(const at::Tensor & self, const at::Tensor & mask);
+TORCH_API at::Tensor & masked_select_out_cpu(const at::Tensor & self, const at::Tensor & mask, at::Tensor & out);
+TORCH_API at::Tensor masked_select_cuda(const at::Tensor & self, const at::Tensor & mask);
+TORCH_API at::Tensor & masked_select_out_cuda(const at::Tensor & self, const at::Tensor & mask, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_exp_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_exp_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..d24c83f41e4852dad221af8e53e96b9598368a13
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/matrix_exp_backward.h
@@ -0,0 +1,30 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/matrix_exp_backward_ops.h>
+
+namespace at {
+
+
+// aten::matrix_exp_backward(Tensor self, Tensor grad) -> Tensor
+inline at::Tensor matrix_exp_backward(const at::Tensor & self, const at::Tensor & grad) {
+    return at::_ops::matrix_exp_backward::call(self, grad);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_backward_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_backward_compositeexplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc219d66a9ccc27ff012654661a59012a6091f80
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_backward_compositeexplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautograd {
+
+TORCH_API at::Tensor & max_pool2d_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false);
+TORCH_API at::Tensor & max_pool2d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out);
+
+} // namespace compositeexplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_backward_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f84c90f467598bb4a2162d8d0bfcb40d5450c49
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_backward_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API max_pool2d_with_indices_backward_grad_input {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::max_pool2d_with_indices_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "grad_input")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "max_pool2d_with_indices_backward.grad_input(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices, *, Tensor(a!) grad_input) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices, at::Tensor & grad_input);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices, at::Tensor & grad_input);
+};
+
+struct TORCH_API max_pool2d_with_indices_backward {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::max_pool2d_with_indices_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "max_pool2d_with_indices_backward(Tensor grad_output, Tensor self, int[2] kernel_size, int[2] stride, int[2] padding, int[2] dilation, bool ceil_mode, Tensor indices) -> Tensor")
+  static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, const at::Tensor & indices);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mean_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mean_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..03dc4796d5397f93dfb0b1ba5342e1354d85d3bc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mean_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor mean(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & mean_out(at::Tensor & out, const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim=false, c10::optional<at::ScalarType> dtype=c10::nullopt);
+TORCH_API at::Tensor & mean_outf(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype, at::Tensor & out);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3f12ef21fa98c5ff432e2a933d42d7a48d17939
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_batch_norm_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API miopen_batch_norm {
+  using schema = ::std::tuple<at::Tensor,at::Tensor,at::Tensor> (const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, bool, double, double);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::miopen_batch_norm")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)")
+  static ::std::tuple<at::Tensor,at::Tensor,at::Tensor> call(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon);
+  static ::std::tuple<at::Tensor,at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon);
+};
+
+struct TORCH_API miopen_batch_norm_out {
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> (const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, const c10::optional<at::Tensor> &, bool, double, double, at::Tensor &, at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::miopen_batch_norm")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "miopen_batch_norm.out(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon, *, Tensor(a!) out0, Tensor(b!) out1, Tensor(c!) out2) -> (Tensor(a!), Tensor(b!), Tensor(c!))")
+  static ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> call(const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+  static ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, const c10::optional<at::Tensor> & running_mean, const c10::optional<at::Tensor> & running_var, bool training, double exponential_average_factor, double epsilon, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mish_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mish_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..cce89804eaebd1d8307c02ea6f4caa2d2651d47b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mish_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor mish(const at::Tensor & self);
+TORCH_API at::Tensor & mish_(at::Tensor & self);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ne_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ne_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f4847a150f978cc90217b44533e962d4567a7aac
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ne_cuda_dispatch.h
@@ -0,0 +1,30 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor ne(const at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor & ne_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor & ne_outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out);
+TORCH_API at::Tensor & ne_(at::Tensor & self, const at::Scalar & other);
+TORCH_API at::Tensor ne(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & ne_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & ne_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & ne_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_empty_strided_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_empty_strided_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..27087b6af4671a3c4082ddb2cd7d8675c0ab27c1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_empty_strided_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API new_empty_strided {
+  using schema = at::Tensor (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::optional<at::ScalarType>, c10::optional<at::Layout>, c10::optional<at::Device>, c10::optional<bool>);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::new_empty_strided")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "new_empty_strided(Tensor self, SymInt[] size, SymInt[] stride, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory);
+};
+
+struct TORCH_API new_empty_strided_out {
+  using schema = at::Tensor & (const at::Tensor &, c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::new_empty_strided")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "new_empty_strided.out(Tensor self, SymInt[] size, SymInt[] stride, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_max_pool1d_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_max_pool1d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ffab7d04ec2992d24b7420e5b242b94a520833c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_max_pool1d_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API quantized_max_pool1d {
+  using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::quantized_max_pool1d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "quantized_max_pool1d(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode);
+};
+
+struct TORCH_API quantized_max_pool1d_out {
+  using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::quantized_max_pool1d")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "quantized_max_pool1d.out(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/randperm.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/randperm.h
new file mode 100644
index 0000000000000000000000000000000000000000..865e46ccad3e630fee02bb7e863b656d377dd861
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/randperm.h
@@ -0,0 +1,201 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/randperm_ops.h>
+
+namespace at {
+
+
+// aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor randperm(int64_t n, at::TensorOptions options=at::kLong) {
+    return at::_ops::randperm::call(n, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor randperm(int64_t n, at::TensorOptions options=at::kLong) {
+    return at::_ops::randperm::call(n, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}
+
+// aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor randperm(int64_t n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::randperm::call(n, dtype, layout, device, pin_memory);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor randperm(int64_t n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::randperm::call(n, dtype, layout, device, pin_memory);
+  }
+}
+
+// aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor randperm_symint(c10::SymInt n, at::TensorOptions options=at::kLong) {
+    return at::_ops::randperm::call(n, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor randperm(c10::SymInt n, at::TensorOptions options=at::kLong) {
+    return at::_ops::randperm::call(n, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}
+
+// aten::randperm(SymInt n, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor randperm_symint(c10::SymInt n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::randperm::call(n, dtype, layout, device, pin_memory);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor randperm(c10::SymInt n, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::randperm::call(n, dtype, layout, device, pin_memory);
+  }
+}
+
+// aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor randperm(int64_t n, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+    return at::_ops::randperm_generator::call(n, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor randperm(int64_t n, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+    return at::_ops::randperm_generator::call(n, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}
+
+// aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor randperm(int64_t n, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::randperm_generator::call(n, generator, dtype, layout, device, pin_memory);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor randperm(int64_t n, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::randperm_generator::call(n, generator, dtype, layout, device, pin_memory);
+  }
+}
+
+// aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor randperm_symint(c10::SymInt n, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+    return at::_ops::randperm_generator::call(n, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor randperm(c10::SymInt n, c10::optional<at::Generator> generator, at::TensorOptions options=at::kLong) {
+    return at::_ops::randperm_generator::call(n, generator, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt());
+  }
+}
+
+// aten::randperm.generator(SymInt n, *, Generator? generator, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
+inline at::Tensor randperm_symint(c10::SymInt n, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::randperm_generator::call(n, generator, dtype, layout, device, pin_memory);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor randperm(c10::SymInt n, c10::optional<at::Generator> generator, c10::optional<at::ScalarType> dtype, c10::optional<at::Layout> layout, c10::optional<at::Device> device, c10::optional<bool> pin_memory) {
+    return at::_ops::randperm_generator::call(n, generator, dtype, layout, device, pin_memory);
+  }
+}
+
+// aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & randperm_out(at::Tensor & out, int64_t n) {
+    return at::_ops::randperm_out::call(n, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & randperm_out(at::Tensor & out, int64_t n) {
+    return at::_ops::randperm_out::call(n, out);
+  }
+}
+
+// aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & randperm_outf(int64_t n, at::Tensor & out) {
+    return at::_ops::randperm_out::call(n, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & randperm_outf(int64_t n, at::Tensor & out) {
+    return at::_ops::randperm_out::call(n, out);
+  }
+}
+
+// aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & randperm_symint_out(at::Tensor & out, c10::SymInt n) {
+    return at::_ops::randperm_out::call(n, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & randperm_out(at::Tensor & out, c10::SymInt n) {
+    return at::_ops::randperm_out::call(n, out);
+  }
+}
+
+// aten::randperm.out(SymInt n, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & randperm_symint_outf(c10::SymInt n, at::Tensor & out) {
+    return at::_ops::randperm_out::call(n, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & randperm_outf(c10::SymInt n, at::Tensor & out) {
+    return at::_ops::randperm_out::call(n, out);
+  }
+}
+
+// aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & randperm_out(at::Tensor & out, int64_t n, c10::optional<at::Generator> generator) {
+    return at::_ops::randperm_generator_out::call(n, generator, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & randperm_out(at::Tensor & out, int64_t n, c10::optional<at::Generator> generator) {
+    return at::_ops::randperm_generator_out::call(n, generator, out);
+  }
+}
+
+// aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & randperm_outf(int64_t n, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::randperm_generator_out::call(n, generator, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & randperm_outf(int64_t n, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::randperm_generator_out::call(n, generator, out);
+  }
+}
+
+// aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & randperm_symint_out(at::Tensor & out, c10::SymInt n, c10::optional<at::Generator> generator) {
+    return at::_ops::randperm_generator_out::call(n, generator, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & randperm_out(at::Tensor & out, c10::SymInt n, c10::optional<at::Generator> generator) {
+    return at::_ops::randperm_generator_out::call(n, generator, out);
+  }
+}
+
+// aten::randperm.generator_out(SymInt n, *, Generator? generator, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & randperm_symint_outf(c10::SymInt n, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::randperm_generator_out::call(n, generator, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & randperm_outf(c10::SymInt n, c10::optional<at::Generator> generator, at::Tensor & out) {
+    return at::_ops::randperm_generator_out::call(n, generator, out);
+  }
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4299fabddb76a284091f854ba1edc6240c61874d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/reflection_pad1d_backward_cpu_dispatch.h
@@ -0,0 +1,28 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor reflection_pad1d_backward(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor reflection_pad1d_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad1d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad1d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::IntArrayRef padding, at::Tensor & grad_input);
+TORCH_API at::Tensor & reflection_pad1d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding);
+TORCH_API at::Tensor & reflection_pad1d_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & grad_input);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1ca317e8cefce71bbf743637833dde0d68f5568a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/relu_cpu_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor relu(const at::Tensor & self);
+TORCH_API at::Tensor & relu_(at::Tensor & self);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..d29a4bee8b4d293a1d8a6840694f3acb3368c4d1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cuda_dispatch.h
@@ -0,0 +1,27 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API at::Tensor remainder(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & remainder_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & remainder_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & remainder_(at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor remainder(const at::Scalar & self, const at::Tensor & other);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/renorm_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/renorm_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0e1aa603be317bc5167b80d049e0eb54919f96c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/renorm_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_renorm : public at::impl::MetaBase {
+    
+    
+    void meta(const at::Tensor & self, const at::Scalar & p, int64_t dim, const at::Scalar & maxnorm);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/round_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/round_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd43946208ce0e60a09444ea4bb78c4b3ccb21dc
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/round_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor round(const at::Tensor & self);
+TORCH_API at::Tensor & round_(at::Tensor & self);
+TORCH_API at::Tensor round(const at::Tensor & self, int64_t decimals);
+TORCH_API at::Tensor & round_(at::Tensor & self, int64_t decimals);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rrelu_with_noise_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rrelu_with_noise_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d652d0c281bdafcc2758ea46c734889a9b92edb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rrelu_with_noise_backward.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/rrelu_with_noise_backward_ops.h>
+
+namespace at {
+
+
+// aten::rrelu_with_noise_backward(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result) -> Tensor
+inline at::Tensor rrelu_with_noise_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, bool self_is_result) {
+    return at::_ops::rrelu_with_noise_backward::call(grad_output, self, noise, lower, upper, training, self_is_result);
+}
+
+// aten::rrelu_with_noise_backward.out(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & rrelu_with_noise_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, bool self_is_result) {
+    return at::_ops::rrelu_with_noise_backward_out::call(grad_output, self, noise, lower, upper, training, self_is_result, out);
+}
+// aten::rrelu_with_noise_backward.out(Tensor grad_output, Tensor self, Tensor noise, Scalar lower, Scalar upper, bool training, bool self_is_result, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & rrelu_with_noise_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & noise, const at::Scalar & lower, const at::Scalar & upper, bool training, bool self_is_result, at::Tensor & out) {
+    return at::_ops::rrelu_with_noise_backward_out::call(grad_output, self, noise, lower, upper, training, self_is_result, out);
+}
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_add_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_add_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b4d0d1ed791f886b568f2a2fb18c53bc80af67c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_add_native.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/scatter_add_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_scatter_add : public at::meta::structured_scatter_add {
+void impl(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, const at::Tensor & out);
+};
+TORCH_API at::Tensor scatter_add(const at::Tensor & self, at::Dimname dim, const at::Tensor & index, const at::Tensor & src);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4561c1f978ecb362ec541c70cddda43556fcfe87
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/scatter_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,30 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor scatter(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src);
+TORCH_API at::Tensor & scatter_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src);
+TORCH_API at::Tensor scatter(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value);
+TORCH_API at::Tensor & scatter_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value);
+TORCH_API at::Tensor scatter(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce);
+TORCH_API at::Tensor & scatter_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Tensor & src, c10::string_view reduce);
+TORCH_API at::Tensor scatter(const at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce);
+TORCH_API at::Tensor & scatter_(at::Tensor & self, int64_t dim, const at::Tensor & index, const at::Scalar & value, c10::string_view reduce);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sign_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sign_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..e93ccf0367d83c3eeae71196b1fa051992eba8da
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sign_ops.h
@@ -0,0 +1,50 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API sign {
+  using schema = at::Tensor (const at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::sign")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "sign(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self);
+};
+
+struct TORCH_API sign_ {
+  using schema = at::Tensor & (at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::sign_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "sign_(Tensor(a!) self) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self);
+};
+
+struct TORCH_API sign_out {
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::sign")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "sign.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_meta.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d04ed1894b5c697d3afcd4bc5f50287075c48d1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_v_meta.h
@@ -0,0 +1,27 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_special_chebyshev_polynomial_v : public TensorIteratorBase {
+    
+    
+    void meta(const at::Tensor & x, const at::Tensor & n);
+};
+
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfinv_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfinv_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef0b62252192b07ca392d2bad99c59f33067420f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfinv_native.h
@@ -0,0 +1,22 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+
+
+namespace at {
+namespace native {
+TORCH_API at::Tensor special_erfinv(const at::Tensor & self);
+TORCH_API at::Tensor & special_erfinv_out(const at::Tensor & self, at::Tensor & out);
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_i0e_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_i0e_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..a59b536a52deff0fd31418afe5f3b647bd2b10cd
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_i0e_cpu_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor special_i0e(const at::Tensor & self);
+TORCH_API at::Tensor & special_i0e_out(at::Tensor & out, const at::Tensor & self);
+TORCH_API at::Tensor & special_i0e_outf(const at::Tensor & self, at::Tensor & out);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0789c669cda9f72630a662514e2f247ad9457457
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_scaled_modified_bessel_k1_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor special_scaled_modified_bessel_k1(const at::Tensor & x);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_with_sizes_copy_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_with_sizes_copy_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5c756bb9dd3d57031787b33210ce791b98aa3ed
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/split_with_sizes_copy_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API split_with_sizes_copy {
+  using schema = ::std::vector<at::Tensor> (const at::Tensor &, c10::SymIntArrayRef, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::split_with_sizes_copy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "split_with_sizes_copy(Tensor self, SymInt[] split_sizes, int dim=0) -> Tensor[]")
+  static ::std::vector<at::Tensor> call(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim);
+  static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim);
+};
+
+struct TORCH_API split_with_sizes_copy_out {
+  using schema = void (const at::Tensor &, c10::SymIntArrayRef, int64_t, at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::split_with_sizes_copy")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "split_with_sizes_copy.out(Tensor self, SymInt[] split_sizes, int dim=0, *, Tensor(a!)[] out) -> ()")
+  static void call(const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, at::TensorList out);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymIntArrayRef split_sizes, int64_t dim, at::TensorList out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/stft_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/stft_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..77318eb5ae414eb5a9eeedcf1c76e6f3a789004b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/stft_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor stft(const at::Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length, c10::optional<int64_t> win_length, const c10::optional<at::Tensor> & window, bool normalized, c10::optional<bool> onesided=c10::nullopt, c10::optional<bool> return_complex=c10::nullopt);
+TORCH_API at::Tensor stft(const at::Tensor & self, int64_t n_fft, c10::optional<int64_t> hop_length=c10::nullopt, c10::optional<int64_t> win_length=c10::nullopt, const c10::optional<at::Tensor> & window={}, bool center=true, c10::string_view pad_mode="reflect", bool normalized=false, c10::optional<bool> onesided=c10::nullopt, c10::optional<bool> return_complex=c10::nullopt);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_size_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_size_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..44634d72c4541d512729ac492766758ccfc3c657
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/sym_size_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API c10::SymInt sym_size(const at::Tensor & self, int64_t dim);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trapz_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trapz_compositeimplicitautograd_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..4dfe5056e4d41771e59fa82ad47cdd68456127f1
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/trapz_compositeimplicitautograd_dispatch.h
@@ -0,0 +1,24 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeimplicitautograd {
+
+TORCH_API at::Tensor trapz(const at::Tensor & y, const at::Tensor & x, int64_t dim=-1);
+TORCH_API at::Tensor trapz(const at::Tensor & y, double dx=1, int64_t dim=-1);
+
+} // namespace compositeimplicitautograd
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/triangular_solve_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/triangular_solve_cuda_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf2f57766b0c156aae72d26d376d7d22733ddba8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/triangular_solve_cuda_dispatch.h
@@ -0,0 +1,25 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cuda {
+
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> triangular_solve(const at::Tensor & self, const at::Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> triangular_solve_out(at::Tensor & X, at::Tensor & M, const at::Tensor & self, const at::Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false);
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &> triangular_solve_outf(const at::Tensor & self, const at::Tensor & A, bool upper, bool transpose, bool unitriangular, at::Tensor & X, at::Tensor & M);
+
+} // namespace cuda
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b9e1fe036a0b8c9e09aa1050bf4c192cb9c7742
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_cpu_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor tril(const at::Tensor & self, int64_t diagonal=0);
+TORCH_API at::Tensor & tril_out(at::Tensor & out, const at::Tensor & self, int64_t diagonal=0);
+TORCH_API at::Tensor & tril_outf(const at::Tensor & self, int64_t diagonal, at::Tensor & out);
+TORCH_API at::Tensor & tril_(at::Tensor & self, int64_t diagonal=0);
+
+} // namespace cpu
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_native.h
new file mode 100644
index 0000000000000000000000000000000000000000..7859e44a34ec6442314b11262b300a03f391fce8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_native.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from NativeFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/tril_meta.h>
+
+namespace at {
+namespace native {
+struct TORCH_API structured_tril_cpu : public at::meta::structured_tril {
+void impl(const at::Tensor & self, int64_t diagonal, const at::Tensor & out);
+};
+struct TORCH_API structured_tril_cuda : public at::meta::structured_tril {
+void impl(const at::Tensor & self, int64_t diagonal, const at::Tensor & out);
+};
+} // namespace native
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold.h
new file mode 100644
index 0000000000000000000000000000000000000000..811f866cd07e2fc9bc058723cd6f0971fe771893
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unfold.h
@@ -0,0 +1,26 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+
+
+
+#include <ATen/ops/unfold_ops.h>
+
+namespace at {
+
+
+
+}
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unsafe_split_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unsafe_split_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..95cd64face574b0c6e026a1b91b4ddc2a7c93554
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unsafe_split_ops.h
@@ -0,0 +1,39 @@
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API unsafe_split_Tensor {
+  using schema = ::std::vector<at::Tensor> (const at::Tensor &, c10::SymInt, int64_t);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::unsafe_split")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "unsafe_split.Tensor(Tensor self, SymInt split_size, int dim=0) -> Tensor[]")
+  static ::std::vector<at::Tensor> call(const at::Tensor & self, c10::SymInt split_size, int64_t dim);
+  static ::std::vector<at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim);
+};
+
+struct TORCH_API unsafe_split_Tensor_out {
+  using schema = void (const at::Tensor &, c10::SymInt, int64_t, at::TensorList);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::unsafe_split")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "Tensor_out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "unsafe_split.Tensor_out(Tensor self, SymInt split_size, int dim=0, *, Tensor(a!)[] out) -> ()")
+  static void call(const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out);
+  static void redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::SymInt split_size, int64_t dim, at::TensorList out);
+};
+
+}} // namespace at::_ops
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_as_real_copy_compositeexplicitautogradnonfunctional_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_as_real_copy_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..86d644a2f214ba08720dcc0b946e9044ed2e7bd9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/view_as_real_copy_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,23 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor view_as_real_copy(const at::Tensor & self);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/xlogy_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/xlogy_cpu_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..9010f9c905d49bf2d3438b5968b1a0db8b7db231
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/xlogy_cpu_dispatch.h
@@ -0,0 +1,26 @@
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace cpu {
+
+TORCH_API at::Tensor xlogy(const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & xlogy_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);
+TORCH_API at::Tensor & xlogy_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);
+TORCH_API at::Tensor & xlogy_(at::Tensor & self, const at::Tensor & other);
+
+} // namespace cpu
+} // namespace at