koichi12 committed on Feb 12, 2025

Commit

1635328

verified ·

1 Parent(s): 1105a93

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Config.h +22 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/FuncTorchTLS.h +46 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyVmapMode.h +26 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyVmapTransforms.h +183 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/OpMathType.h +69 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/RegistrationDeclarations.h +0 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ScalarOps.h +53 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Storage.h +2 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/WrapDimUtils.h +153 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/code_template.h +243 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_compositeexplicitautograd_dispatch.h +26 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cast_Short_native.h +21 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_ops.h +28 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_cuda_dispatch.h +23 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_flash_attention_backward_native.h +21 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_acos_native.h +25 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_slogdet_meta.h +27 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_svd_cpu_dispatch.h +25 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_logcumsumexp.h +39 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_native_multi_head_attention.h +39 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_tensor_storage_offsets_native.h +22 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_log_softmax_backward_data_native.h +23 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_native.h +28 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_nearest_exact2d_backward_meta.h +27 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_csr_tensor_args.h +30 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/align_as_ops.h +28 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/aminmax_meta.h +27 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_ops.h +50 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/constant_pad_nd_compositeexplicitautograd_dispatch.h +28 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/dot_cuda_dispatch.h +23 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_hfft.h +91 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfft2_native.h +22 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_rfftn_native.h +22 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fix.h +44 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/float_power_compositeimplicitautograd_dispatch.h +33 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/geometric_cpu_dispatch.h +23 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/glu_backward_ops.h +39 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardshrink_native.h +23 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardtanh_backward_cpu_dispatch.h +25 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isposinf_cuda_dispatch.h +25 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/le_compositeexplicitautogradnonfunctional_dispatch.h +26 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_qr_compositeexplicitautogradnonfunctional_dispatch.h +23 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_svd_native.h +22 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_vector_norm_meta.h +27 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logaddexp2_cuda_dispatch.h +25 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool1d_with_indices.h +30 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_ops.h +39 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_convolution_relu_ops.h +28 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_rnn_layer_backward_cpu_dispatch.h +23 -0

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/kernel/__pycache__/bmm.cpython-311.pyc ADDED Viewed

Binary file (5.36 kB). View file

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Config.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#pragma once
+// Test these using #if AT_MKL_ENABLED(), not #ifdef, so that it's
+// obvious if you forgot to include Config.h
+//    c.f. https://stackoverflow.com/questions/33759787/generating-an-error-if-checked-boolean-macro-is-not-defined
+//
+// DO NOT put the macros for CUDA libraries in this file; they belong in cuda/CUDAConfig.h
+#define AT_MKLDNN_ENABLED() 1 // function-like: test with `#if AT_MKLDNN_ENABLED()`, never `#ifdef`
+#define AT_MKLDNN_ACL_ENABLED() 0
+#define AT_MKL_ENABLED() 1
+#define AT_MKL_SEQUENTIAL() 0
+#define AT_POCKETFFT_ENABLED() 0
+#define AT_NNPACK_ENABLED() 1
+#define CAFFE2_STATIC_LINK_CUDA() 0
+#define AT_BUILD_WITH_BLAS() 1
+#define AT_BUILD_WITH_LAPACK() 1
+#define AT_PARALLEL_OPENMP 1 // object-like (no parentheses), unlike the function-like macros above
+#define AT_PARALLEL_NATIVE 0 // object-like
+#define AT_PARALLEL_NATIVE_TBB 0 // object-like
+#define AT_BLAS_F2C() 0
+#define AT_BLAS_USE_CBLAS_DOT() 0

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/FuncTorchTLS.h ADDED Viewed

	@@ -0,0 +1,46 @@

+#pragma once
+#include <c10/macros/Macros.h>
+#include <memory>
+namespace at::functorch {
+// NOTE [functorch TLS in pytorch/pytorch]
+//
+// functorch lives out-of-tree. However, it has some TLS that needs to be
+// propagated. The solution for that is we store a pointer to the TLS
+// inside pytorch/pytorch and extend FuncTorchTLSBase inside functorch to
+// include whatever functorch needs.
+//
+// We need to store a pointer due to the indirection:
+// inside functorch, we will create a subclass of FuncTorchTLSBase called
+// FuncTorchTLSImpl that actually contains metadata, like the DynamicLayerStack.
+// FuncTorchTLSBase doesn't have any metadata because it hasn't been defined
+// yet.
+//
+// Here in pytorch/pytorch, we will pass around FuncTorchTLSBase*, but inside
+// functorch, we will assign a FuncTorchTLSImpl* to the FuncTorchTLSBase*.
+// We can't directly pass around FuncTorchTLSBase (without a pointer) because
+// FuncTorchTLSImpl does not fit inside a FuncTorchTLSBase by virtue of having
+// more elements.
+struct TORCH_API FuncTorchTLSBase { // abstract interface: every operation below is pure virtual
+  virtual ~FuncTorchTLSBase() = default; // virtual so a subclass can be destroyed through a base pointer
+  virtual std::unique_ptr<FuncTorchTLSBase> deepcopy() const = 0; // hands back a separately owned copy of the state
+  virtual int64_t checkSupportsSingleLevelAutogradFunction() const = 0; // NOTE(review): meaning of the int64_t result is not visible in this header
+  virtual void checkSupportsCppAutogradFunction() const = 0; // NOTE(review): presumably reports failure by throwing -- confirm in functorch
+  virtual void checkSupportsInplaceRequiresGrad() const = 0;
+  virtual void checkSupportsRetainGrad() const = 0;
+};
+// returns deepcopy of the functorch tls
+TORCH_API std::unique_ptr<FuncTorchTLSBase> getCopyOfFuncTorchTLS();
+// sets the functorch tls. always does a deep copy.
+TORCH_API void setFuncTorchTLS(
+    const std::shared_ptr<const FuncTorchTLSBase>& state);
+// get a mutable reference to the functorch tls
+TORCH_API std::unique_ptr<FuncTorchTLSBase>& functorchTLSAccessor();
+} // namespace at::functorch

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyVmapMode.h ADDED Viewed

	@@ -0,0 +1,26 @@

+#pragma once
+#include <c10/core/impl/LocalDispatchKeySet.h>
+namespace at::impl {
+// VmapMode contains a thread local count of how many nested vmaps
+// we are currently inside. That number is known as the `vmap level`.
+// VmapMode is used in the implementation of the Python `torch.vmap` API.
+//
+// NOTE: this is NOT the c++ api for torch.vmap. That doesn't exist yet.
+struct TORCH_API VmapMode { // static-only interface; declared here, defined elsewhere
+  // Returns the vmap level, aka the count of how many nested vmaps we're in.
+  static int64_t current_vmap_level();
+  // Increments the count of nested vmaps. If this causes the vmap level to be
+  // greater than 0, then it enables DispatchKey::VmapMode on all tensors.
+  static int64_t increment_nesting(); // NOTE(review): presumably returns the updated level -- confirm in the implementation
+  // Decrements the count of nested vmaps. If this causes the vmap level to be
+  // equal to 0, then it disables DispatchKey::VmapMode on all tensors.
+  static int64_t decrement_nesting(); // NOTE(review): presumably returns the updated level -- confirm in the implementation
+};
+} // namespace at::impl

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/LegacyVmapTransforms.h ADDED Viewed

	@@ -0,0 +1,183 @@

+#pragma once
+#include <ATen/LegacyBatchedTensorImpl.h>
+#include <ATen/core/IListRef.h>
+namespace at {
+// This file contains abstractions used for transforming *logical* vmap
+// arguments into *physical* arguments. (Keep reading for definitions of these
+// terms).
+// NOTE: [Logical vs physical args]
+// Consider the following vmap.
+//   vmap(vmap(func, in_dims=(2,)), in_dims=(0,))(torch.ones(2, 3, 4))
+// This would produce a BatchedTensor wrapping a Tensor of size [2, 3, 4],
+// with batch dims 0 and 2:
+//   BatchedTensor(ones(2, 3, 4), bdims=[(lvl=1,dim=0),(lvl=2,dim=2)])
+//
+// We say the *logical* view of the tensor has size [3] -- tensors inside
+// `func` appear to have size [3].
+// However, the *physical* underlying tensor (the one passed to vmap) has size
+// [2, 3, 4].
+//
+// This notion of logical vs physical also extends to non-tensor arguments.
+// Consider the previous tensor; let's assume the user called
+// `torch.sum(tensor, dim=0)` inside of `func`. Then the logical
+// dimension they are reducing over is dim 0 but the physical dim is dim 1
+// (the first non-batch dimension)
+// Forward declared; see NOTE: [What is a VmapPhysicalView?]
+struct VmapPhysicalView;
+// Most PyTorch operators take 4 or fewer inputs.
+constexpr int64_t kVmapTransformStaticInputSize = 4;
+using VmapPhysicalViewVec =
+    SmallVector<VmapPhysicalView, kVmapTransformStaticInputSize>;
+// PyTorch generally advertises good performance for <= 5 dims.
+// (see ATen/core/DimVector.h). We add a few extra dims (~3) for vmap
+// dimensions to get 8. Adjust this number as necessary.
+constexpr int64_t kVmapStaticDimVecSize = 8;
+using VmapDimVector = SmallVector<int64_t, kVmapStaticDimVecSize>;
+using VmapSymDimVector = SmallVector<c10::SymInt, kVmapStaticDimVecSize>;
+// NOTE: [What is an VmapTransform?]
+// A *VmapTransform* converts logical views of tensors to physical views.
+//
+// Batching rules use VmapTransforms to convert logical arguments to
+// physical arguments, then call one or more at:: operators that handle the
+// physical arguments, and then convert the physical result back to a logical
+// argument.
+// VmapTransform for operators that take tensors with multiple batch dims.
+// Given one or more logical views on Tensors, `logicalToPhysical`
+// permutes all of the batch dims to the front of the tensor, aligns
+// and expands the batch dims to match each other (according to their `level`),
+// and returns a VmapPhysicalView on the tensor(s).
+struct TORCH_API MultiBatchVmapTransform {
+  static VmapPhysicalView logicalToPhysical(const Tensor& logical_tensor); // single-tensor overload: one view out
+  static VmapPhysicalViewVec logicalToPhysical(ITensorListRef logical_tensors); // list overload: returns a vector of views
+};
+// VmapTransform for operators that broadcast all inputs.
+// Given some logical views on Tensors, `logicalToPhysical`:
+// - permutes all of the batch dims to the front of the tensors
+// - aligns all the batch dims to the collective levels of all of the tensors.
+//   If a tensor does not have a batch dim for a vmap level, then it receives
+//   a size-one dimension for said level.
+// - aligns the non-batch dims to have the same dimensionality, adding extra
+//   size-1 dimensions in between the batch dimensions and the non-batch
+//   dimensions so that the batch dimensions are lined up from the right.
+//
+// For example: given inputs of size (B, 2) and (B, 3, 2) where B is the batch
+// dimension, BroadcastingVmapTransform returns VmapPhysicalViews that wrap
+// tensors of size (B, 1, 2) and (B, 3, 2).
+//
+// Given inputs of size (B, 2) and (2,), BroadcastingVmapTransform returns
+// VmapPhysicalViews wrapping tensors of size (B, 2) and (1, 2). We don't
+// actually *need* to return a tensor of size (1, 2) for the second tensor
+// because the broadcasting operation takes care of that for us, but we do
+// it anyways to keep things simple.
+struct TORCH_API BroadcastingVmapTransform {
+  static VmapPhysicalViewVec logicalToPhysical(TensorList logical_tensors); // list form only; no single-tensor overload is declared for this transform
+};
+// Forward declared, if you're reading this file head to toe, don't worry about
+// it yet.
+struct VmapPhysicalToLogicalMap;
+// NOTE: [What is a VmapPhysicalView?]
+// VmapPhysicalView represents a physical view on a Tensor.
+//
+// One can use it to further convert logical dimension indices, logical shapes,
+// and more to their physical variants, or convert a new (physical) tensor into
+// a logical BatchedTensor. (TODO(rzou): some of these are not yet implemented).
+//
+// VmapPhysicalView stores a physical tensor with all of its batch dimensions at
+// the front and some levels that correspond to said batch dimensions.
+//
+// The levels bitset specifies which vmap levels correspond to the batch
+// dimensions at the front of the tensor. In particular, the number of set bits
+// corresponds to the number of batch dimensions on `tensor` and the rightmost
+// bit of `levels` specifies the maximum number of nested vmaps we are in at
+// this point in time.
+// For example, given:
+//   physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5, 6), levels={1, 3})
+//
+// Rightmost bit of `levels` is 3 indicating the number of nested vmaps less
+// than or equal to 3.
+//   bitset: 010100
+//              ^
+//              |
+//   levels: 012345
+struct TORCH_API VmapPhysicalView {
+  VmapPhysicalView(Tensor&& tensor, std::bitset<kVmapNumLevels> levels) // takes over `tensor` by move
+      : levels_(levels), tensor_(std::move(tensor)) {
+    TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor_)); // the wrapped tensor must not itself be a BatchedTensor
+  }
+  Tensor& tensor() { // mutable access to the wrapped tensor
+    return tensor_;
+  }
+  const Tensor& tensor() const { // read-only access to the wrapped tensor
+    return tensor_;
+  }
+  // Maps logical dim indices to physical dim indices. Also does dim wrapping.
+  //
+  // For example, given:
+  //   physical_view = VmapPhysicalView(tensor=ones(2, 3, 4, 5), levels={1, 3})
+  //
+  // Then physical_view.getPhysicalDims({0, 1}) returns {2, 3}.
+  // This is because the number of levels tells us that the first two dimensions
+  // of `tensor_` are batch dimensions, so a logical dim of `n` is actually
+  // a physical dim of `n + 2`.
+  VmapDimVector getPhysicalDims(OptionalIntArrayRef logical_dims) const;
+  int64_t getPhysicalDim(int64_t logical_dim) const; // NOTE(review): presumably the single-index counterpart of getPhysicalDims -- confirm
+  // Returns a VmapPhysicalToLogicalMap object. This can be used for
+  // mapping a physical tensor to a new logical tensor (BatchedTensor)
+  VmapPhysicalToLogicalMap getPhysicalToLogicalMap() const;
+  // Maps a logical shape to a physical shape by pre-pending the batch
+  // sizes to the logical shape.
+  VmapDimVector getPhysicalShape(IntArrayRef logical_shape) const;
+  int64_t numBatchDims() const; // NOTE(review): presumably the number of set bits in levels_ -- confirm
+ private:
+  int64_t numLogicalDims() const;
+  std::bitset<kVmapNumLevels> levels_; // declared before tensor_, matching the constructor's initializer order
+  Tensor tensor_;
+};
+// Convenience struct used for mapping a physical tensor (a non-BatchedTensor)
+// to a logical one (BatchedTensor). It holds some levels that are used to do
+// the mapping and assumes that the batch dimensions in the physical tensor all
+// occur at the front of the tensor.
+struct TORCH_API VmapPhysicalToLogicalMap {
+  VmapPhysicalToLogicalMap(std::bitset<kVmapNumLevels> levels) // not `explicit`: a bitset converts implicitly to this type
+      : levels_(levels) {}
+  // Maps a physical tensor to a new logical tensor (BatchedTensor).
+  // Assumes that all of the "batch dimensions" are at the front
+  // of the physical tensor. For example, given:
+  // - x = rank-4 Tensor with size 2, 3, 5, 7
+  // - levels = (2, 4)
+  // Returns:
+  // - BatchedTensor(x, bdims=[(dim=0,lvl=2), (dim=1, lvl=4)])
+  Tensor apply(const Tensor& physical_tensor) const;
+  // Given a vector of physical tensors,
+  // 1. maps each tensor to a new logical tensor. Assumes that all of the
+  //    "batch dimensions" are at the front of the physical tensors.
+  // 2. stores the new logical tensors back into the passed-in vector. This is
+  //    to avoid additional dynamic allocations.
+  void applyInplace(std::vector<Tensor>& physical_tensors) const;
+  std::bitset<kVmapNumLevels> levels_; // public data member (struct default access)
+};
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/OpMathType.h ADDED Viewed

	@@ -0,0 +1,69 @@

+#pragma once
+#include <c10/core/ScalarType.h>
+#include <c10/util/BFloat16.h>
+#include <c10/util/Exception.h>
+#include <c10/util/Float8_e4m3fn.h>
+#include <c10/util/Float8_e4m3fnuz.h>
+#include <c10/util/Float8_e5m2.h>
+#include <c10/util/Float8_e5m2fnuz.h>
+#include <c10/util/Half.h>
+namespace at {
+// Maps a scalar type to the type ops should use for internal math: Half, BFloat16 and the four Float8 variants compute in float (FP32), complex<Half> computes in complex<float>, and every other type maps to itself.
+template <typename scalar_t>
+struct OpMathType { // primary template: identity mapping
+  using type = scalar_t;
+};
+template <>
+struct OpMathType<at::Half> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::BFloat16> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::Float8_e5m2> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::Float8_e4m3fn> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::Float8_e5m2fnuz> {
+  using type = float;
+};
+template <>
+struct OpMathType<at::Float8_e4m3fnuz> {
+  using type = float;
+};
+template <>
+struct OpMathType<c10::complex<Half>> {
+  using type = c10::complex<float>; // widens the component type, stays complex
+};
+template <typename T>
+using opmath_type = typename OpMathType<T>::type; // shorthand for OpMathType<T>::type
+namespace { // NOTE(review): anonymous namespace in a header -- every including translation unit gets its own copy of the function below
+inline c10::ScalarType toOpMathType(const c10::ScalarType type) { // runtime (ScalarType -> ScalarType) counterpart of opmath_type<T>
+  switch (type) {
+#define DEFINE_CASE(scalar_t, TypeNum) \
+  case ScalarType::TypeNum:            \
+    return CppTypeToScalarType<at::opmath_type<scalar_t>>::value;
+    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE) // stamps out one DEFINE_CASE per scalar type in the macro's list
+#undef DEFINE_CASE
+    default:
+      TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type); // any ScalarType not covered by the list above
+  }
+}
+} // namespace
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/RegistrationDeclarations.h ADDED Viewed

The diff for this file is too large to render. See raw diff

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ScalarOps.h ADDED Viewed

	@@ -0,0 +1,53 @@

+#pragma once
+#include <ATen/Tensor.h>
+#include <c10/core/Scalar.h>
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+#include <ATen/ops/scalar_tensor.h>
+#endif
+namespace at::detail {
+// When filling a number into a 1-element CPU tensor, we want to skip
+// everything else and manipulate the data ptr directly.
+// Ideally this fast path should be implemented in TensorIterator,
+// but we also want to skip compute_types, which is not avoidable
+// in TensorIterator for now.
+Tensor& scalar_fill(Tensor& self, const Scalar& value);
+TORCH_API Tensor scalar_tensor_static(
+    const Scalar& s,
+    c10::optional<ScalarType> dtype_opt,
+    c10::optional<Device> device_opt);
+} // namespace at::detail
+// This is in the c10 namespace because we use ADL to find the functions in it.
+namespace c10 {
+// FIXME: this should be (and was) Scalar::toTensor, but there is currently no
+// way to implement this without going through Derived Types (which are not part
+// of core).
+inline at::Tensor scalar_to_tensor(
+    const Scalar& s,
+    const Device device = at::kCPU) { // target device; defaults to CPU
+  // This is the fast track we have for CPU scalar tensors.
+  if (device == at::kCPU) {
+    return at::detail::scalar_tensor_static(s, s.type(), at::kCPU); // dtype is taken from the scalar itself
+  }
+  return at::scalar_tensor(s, at::device(device).dtype(s.type())); // any other device: generic factory call, same dtype rule
+}
+} // namespace c10
+namespace at::native {
+inline Tensor wrapped_scalar_tensor(
+    const Scalar& scalar,
+    const Device device = at::kCPU) { // target device; defaults to CPU
+  auto tensor = scalar_to_tensor(scalar, device); // unqualified call: per the comment on `namespace c10` in this header, it is found via ADL
+  tensor.unsafeGetTensorImpl()->set_wrapped_number(true); // flag the result as a wrapped number before handing it back
+  return tensor;
+}
+} // namespace at::native

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Storage.h ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ #pragma once
2	+ #include <c10/core/Storage.h>

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/WrapDimUtils.h ADDED Viewed

	@@ -0,0 +1,153 @@

+#pragma once
+#include <ATen/core/IListRef.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/TensorImpl.h>
+#include <c10/core/WrapDimMinimal.h>
+#include <c10/util/irange.h>
+namespace at {
+// if dim_post_expr is 0 and wrap_scalar is true, then dim must be in the
+// range [-1, 0]. This is a special case for scalar tensors and manifests in
+// e.g. torch.sum(scalar_tensor, 0). Otherwise, dim should be in the range
+// [-dim_post_expr, dim_post_expr-1].
+using c10::maybe_wrap_dim;
+inline int64_t maybe_wrap_dim(int64_t dim, TensorImpl* tensor) { // `tensor` is dereferenced unconditionally, so it must be non-null
+  return maybe_wrap_dim(dim, tensor->dim()); // wrap against the tensor's own rank
+}
+inline int64_t maybe_wrap_dim(int64_t dim, TensorList tensors) {
+  if (tensors.empty()) {
+    // can't wrap empty TensorList; rely on underlying implementation to throw
+    // error if necessary.
+    return dim; // returned unchanged, i.e. possibly still negative
+  }
+  return maybe_wrap_dim(dim, tensors[0].dim()); // only the first tensor's rank is consulted
+}
+inline int64_t maybe_wrap_dim(
+    int64_t dim,
+    const std::vector<std::vector<int64_t>>& tensor_sizes) { // one size vector per tensor
+  if (tensor_sizes.empty()) {
+    // can't wrap empty list; rely on underlying implementation to throw error
+    // if necessary
+    return dim; // returned unchanged, i.e. possibly still negative
+  }
+  return maybe_wrap_dim(dim, tensor_sizes[0].size()); // rank = length of the first size vector only
+}
+// Given an array of dimensions `dims` of length `ndims`, this function "wraps"
+// each dim in-place for a tensor of rank `dim_post_expr`, allowing dims to be
+// specified using negative indices.
+//
+// Additionally, if `wrap_scalars` is true, then scalar tensors with rank 0 will
+// allow dimensions in the range [-1, 0]. Otherwise, an IndexError is raised for
+// dimensions not in the range [-dim_post_expr, dim_post_expr).
+inline void maybe_wrap_dims_n(
+    int64_t* dims, // in/out: rewritten in place
+    int64_t ndims, // number of entries in `dims`
+    int64_t dim_post_expr, // rank to wrap against
+    bool wrap_scalars = true) {
+  if (dim_post_expr <= 0) { // tensor has no dimensions
+    if (wrap_scalars) {
+      dim_post_expr = 1; // this will make range [-1, 0]
+    } else {
+      TORCH_CHECK_INDEX(
+          ndims == 0,
+          "Dimension specified as ",
+          dims[0],
+          " but tensor has no dimensions");
+      return; // the check above passed, so ndims == 0 and there is nothing to wrap
+    }
+  }
+  int64_t min = -dim_post_expr; // inclusive lower bound
+  int64_t max = dim_post_expr - 1; // inclusive upper bound
+  for (const auto i : c10::irange(ndims)) {
+    auto& dim = dims[i]; // reference, so the adjustment below writes back into `dims`
+    if (dim < min || dim > max) {
+      TORCH_CHECK_INDEX(
+          false,
+          "Dimension out of range (expected to be in range of [",
+          min,
+          ", ",
+          max,
+          "], but got ",
+          dim,
+          ")");
+    }
+    if (dim < 0)
+      dim += dim_post_expr; // map a negative index onto its non-negative equivalent
+  }
+}
+// Given a contiguous container of dimensions `dims`, this function "wraps"
+// each dim in-place for a tensor of rank `dim_post_expr`, allowing dims to be
+// specified using negative indices.
+//
+// Additionally, if `wrap_scalars` is true, then scalar tensors with rank 0 will
+// allow dimensions in the range [-1, 0]. Otherwise, an IndexError is raised for
+// dimensions not in the range [-dim_post_expr, dim_post_expr).
+template <typename Container>
+inline void maybe_wrap_dims(
+    Container& dims, // must provide data() and size(); modified in place
+    int64_t dim_post_expr,
+    bool wrap_scalars = true) {
+  return maybe_wrap_dims_n( // thin forwarder to the pointer + length form
+      dims.data(), dims.size(), dim_post_expr, wrap_scalars);
+}
+// previously, size [0] tensors were the only possible empty tensors; thus, it
+// wasn't possible to cat empty tensors unless all the other tensors were
+// 1-dimensional, so we allowed these tensors to be "skipped" (both for wrap
+// dimension behavior and dimension size checking). We maintain this behavior
+// for backwards compatibility, but only for this specific size (i.e. other
+// empty sizes are not skipped).
+template <typename T>
+inline int64_t _legacy_cat_wrap_dim(
+    int64_t dim,
+    const std::vector<std::vector<T>>& tensor_sizes) { // one size vector per tensor; T is the size element type
+  for (auto& sizes : tensor_sizes) {
+    if (sizes.size() == 1 && sizes[0] == 0) { // legacy empty tensor of size [0]
+      continue; // skipped for backwards compatibility
+    }
+    return maybe_wrap_dim(dim, sizes.size()); // the first non-skipped entry alone decides the rank
+  }
+  return dim; // empty list, or every entry was skipped: dim is returned unwrapped
+}
+inline int64_t legacy_cat_wrap_dim(
+    int64_t dim,
+    const std::vector<std::vector<int64_t>>& tensor_sizes) {
+  return _legacy_cat_wrap_dim<int64_t>(dim, tensor_sizes); // concrete int64_t entry point for the shared template
+}
+inline int64_t legacy_cat_wrap_dim_symint(
+    int64_t dim,
+    const std::vector<std::vector<c10::SymInt>>& tensor_sizes) {
+  return _legacy_cat_wrap_dim<c10::SymInt>(dim, tensor_sizes); // c10::SymInt entry point for the shared template
+}
+inline int64_t legacy_cat_wrap_dim(
+    int64_t dim,
+    const MaterializedITensorListRef& tensors) { // same rule as the size-vector overloads, applied to tensors directly
+  for (const Tensor& tensor : tensors) {
+    if (tensor.dim() == 1 && tensor.sizes()[0] == 0) { // legacy empty tensor of size [0]
+      continue; // skipped for backwards compatibility
+    }
+    return maybe_wrap_dim(dim, tensor.dim()); // the first non-skipped tensor alone decides the rank
+  }
+  return dim; // empty list, or every tensor was skipped: dim is returned unwrapped
+}
+// wrap negative dims in a vector
+inline void wrap_all_dims(
+    std::vector<int64_t>& dims_to_wrap, // in/out: each entry is replaced by its wrapped value
+    int64_t tensor_total_dims) { // rank that every entry is wrapped against
+  for (const auto i : c10::irange(dims_to_wrap.size())) {
+    dims_to_wrap[i] = maybe_wrap_dim(dims_to_wrap[i], tensor_total_dims);
+  }
+}
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/code_template.h ADDED Viewed

	@@ -0,0 +1,243 @@

+#pragma once
+#include <c10/util/irange.h>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+namespace at::jit {
+// A template environment is a mapping from template variable names, e.g.,
+// identifier (corresponding to $identifier) to their expansions.
+//
+// This template environment supports storing strings, numbers and lists
+// of strings, and can be chained together (so that lookup proceeds in
+// the top level environment, and then recurses into a parent
+// environment if the key is not found.)
+struct TemplateEnv {
+  TemplateEnv() = default;
+  TemplateEnv(TemplateEnv& parent) : parent(&parent) {} // non-owning: `parent` must outlive this env. NOTE(review): X(X&) is a copy-constructor signature, so "copying" a non-const env creates a child instead
+  using string_list = std::vector<std::string>;
+  // Add a string 'v' to the map at key 'k'.
+  void s(const std::string& k, const std::string& v) {
+    strings_[k] = v;
+    lists_.erase(k); // within one env a key holds either a string or a list, never both
+  }
+  // Add a number 'v' to the map at key 'k'
+  template <typename T>
+  void d(const std::string& k, const T& v) {
+    strings_[k] = c10::to_string(v); // numbers are kept in their string form
+    lists_.erase(k);
+  }
+  // Retrieve the string representation of the value stored at 'k' from the map.
+  // Raises an exception if the key is not found.
+  const std::string& s(const std::string& k) const {
+    if (strings_.count(k) == 0) {
+      if (parent) {
+        return parent->s(k); // no local string: defer to the parent env
+      }
+      notFound(k); // [[noreturn]]: throws, so at() below only runs when the key exists
+    }
+    return strings_.at(k);
+  }
+  // Store a list of strings 'v' in the map at 'k'.
+  void v(const std::string& k, const string_list& v) {
+    lists_[k] = v;
+    strings_.erase(k); // mirror of s(): drop any string stored under the same key
+  }
+  // Retrieve a list of strings stored at 'k' from the map.
+  // Raises an exception if the key is not found.
+  const string_list& v(const std::string& k) const {
+    if (lists_.count(k) == 0) {
+      if (parent) {
+        return parent->v(k); // no local list: defer to the parent env
+      }
+      notFound(k); // [[noreturn]]: throws, so at() below only runs when the key exists
+    }
+    return lists_.at(k);
+  }
+  // Test if the key 'k' maps to a string (as opposed to a list.)
+  bool keyIsString(const std::string& k) const {
+    if (strings_.count(k) > 0)
+      return true;
+    if (lists_.count(k) > 0)
+      return false;
+    if (parent)
+      return parent->keyIsString(k); // unknown locally: ask the parent env
+    notFound(k); // [[noreturn]], so control never falls off the end without a return value
+  }
+ private:
+  [[noreturn]] void notFound(const std::string& k) const { // always throws std::logic_error naming the missing key
+    std::stringstream ss;
+    ss << "key not found: " << k;
+    throw std::logic_error(ss.str());
+  }
+  std::unordered_map<std::string, std::string> strings_; // string- and number-valued keys
+  std::unordered_map<std::string, string_list> lists_; // list-valued keys
+  TemplateEnv* parent{nullptr}; // enclosing env consulted on lookup miss; null for a top-level env
+};
+/*
+# Match $identifier or ${identifier} and replace with the value in env.
+# If this identifier is at the beginning of whitespace on a line
+# and its value is a list then it is treated as
+# block substitution by indenting all lines of all elements.
+# If the identifier is on a line starting with non-whitespace and a list
+# then it is comma separated. ${,foo} will insert a comma before the list
+# if this list is not empty and ${foo,} will insert one after.
+*/
+// Renders a template string, replacing each `$name` / `${name}` key with the
+// value bound to that name in a TemplateEnv.
+struct CodeTemplate {
+  /* implicit */ CodeTemplate(std::string t) : template_text(std::move(t)) {}
+  // Expands every key of the template against `env` and returns the text.
+  // A key preceded only by whitespace on its line is emitted as an indented
+  // block; any other key is emitted inline (lists become comma separated).
+  // Throws std::logic_error when a key is cut off by the end of the template
+  // or a `${` is not closed by `}`.
+  std::string format(const TemplateEnv& env) const {
+    std::stringstream out;
+    size_t pos = 0;
+    // Number of literal template characters consumed on the current line;
+    // used as the indentation width for block substitutions.
+    size_t indent = 0;
+    // True while only whitespace has been seen on the current line.
+    bool all_whitespace = true;
+    while (pos < template_text.size()) {
+      char c = template_text[pos];
+      if (c == '$') {
+        std::stringstream kss;
+        bool comma_before = false;
+        bool comma_after = false;
+        size_t new_pos = parseKey(pos, kss, comma_before, comma_after);
+        std::string k = kss.str();
+        bool is_string = env.keyIsString(k);
+        if (all_whitespace) {
+          if (is_string)
+            emitStringWithIndents(out, indent, env.s(k));
+          else
+            emitLinesIndented(out, indent, env.v(k));
+        } else {
+          if (is_string)
+            out << env.s(k);
+          else
+            emitCommaSeparatedList(out, env.v(k), comma_before, comma_after);
+        }
+        all_whitespace = false;
+        pos = new_pos;
+      } else {
+        out << c;
+        // The <cctype> classifiers have undefined behavior for negative
+        // values other than EOF, so go through unsigned char first.
+        if (!isspace(static_cast<unsigned char>(c)))
+          all_whitespace = false;
+        indent++;
+        if (c == '\n') {
+          indent = 0;
+          all_whitespace = true;
+        }
+        pos++;
+      }
+    }
+    return out.str();
+  }
+ private:
+  using string_list = std::vector<std::string>;
+  // Bounds-checked read of the template; throws std::logic_error when `p`
+  // is past the end (i.e. the template ended in the middle of a key).
+  char charAt(size_t p) const {
+    if (p >= template_text.size())
+      throw std::logic_error("EOS found in key");
+    return template_text[p];
+  }
+  // Parses the key that starts at the '$' located at `pos`.
+  // Accepts `$ident`, `${ident}`, `${,ident}`, `${ident,}` and `${,ident,}`.
+  // Writes the identifier to `k`, reports the optional commas through
+  // `comma_before` / `comma_after`, and returns the position just past the key.
+  size_t parseKey(
+      size_t pos,
+      std::ostream& k,
+      bool& comma_before,
+      bool& comma_after) const {
+    comma_before = false;
+    comma_after = false;
+    pos++; // skip the '$'
+    if (charAt(pos) == '{') {
+      pos++;
+      if (charAt(pos) == ',') {
+        comma_before = true;
+        pos++;
+      }
+      pos = parseIdent(pos, k);
+      if (charAt(pos) == ',') {
+        comma_after = true;
+        pos++;
+      }
+      if (charAt(pos) != '}')
+        throw std::logic_error("missing terminating '}'");
+      pos++;
+      return pos;
+    } else {
+      return parseIdent(pos, k);
+    }
+  }
+  // Copies the run of [A-Za-z0-9_] characters starting at `pos` into `k` and
+  // returns the position of the first character that is not part of it.
+  size_t parseIdent(size_t pos, std::ostream& k) const {
+    while (pos < template_text.size()) {
+      const char c = template_text[pos];
+      // Cast before classifying: isalnum on a negative char is undefined.
+      if (!isalnum(static_cast<unsigned char>(c)) && c != '_')
+        break;
+      k << c;
+      pos++;
+    }
+    return pos;
+  }
+  // Writes `strings` joined by ", ". The leading / trailing ", " requested by
+  // `comma_before` / `comma_after` is only written for a non-empty list.
+  void emitCommaSeparatedList(
+      std::ostream& out,
+      const string_list& strings,
+      bool comma_before,
+      bool comma_after) const {
+    if (comma_before && !strings.empty())
+      out << ", ";
+    for (const auto i : c10::irange(strings.size())) {
+      if (i > 0)
+        out << ", ";
+      out << strings[i];
+    }
+    if (comma_after && !strings.empty())
+      out << ", ";
+  }
+  // These indentation functions follow the convention that they never emit
+  // leading or trailing newlines when the input string does not have leading
+  // or trailing newlines. It's the responsibility of the calling function
+  // to indent correctly in the context.
+  // Writes `indent` space characters.
+  void emitIndent(std::ostream& out, size_t indent) const {
+    for (C10_UNUSED const auto i : c10::irange(indent)) {
+      out << " ";
+    }
+  }
+  // Writes `str`, emitting `indent` spaces after every '\n' it contains. The
+  // first line is not indented here.
+  void emitStringWithIndents(
+      std::ostream& out,
+      size_t indent,
+      const std::string& str) const {
+    for (auto c : str) {
+      out << c;
+      if (c == '\n') {
+        emitIndent(out, indent);
+      }
+    }
+  }
+  // Writes each element of `strings` on its own line. Every element except
+  // the first is prefixed with `indent` spaces, and no newline follows the
+  // last element.
+  void emitLinesIndented(
+      std::ostream& out,
+      size_t indent,
+      const string_list& strings) const {
+    for (const auto i : c10::irange(strings.size())) {
+      if (i > 0)
+        emitIndent(out, indent);
+      emitStringWithIndents(out, indent, strings[i]);
+      if (i + 1 != strings.size())
+        out << "\n";
+    }
+  }
+  std::string template_text;
+};
+// Convenience wrapper: builds a CodeTemplate from `fmt` and expands it against
+// `env`. The environment is only read, so it is taken by const reference,
+// which also lets callers pass const environments and temporaries.
+static inline std::string format(const std::string& fmt, const TemplateEnv& env) {
+  return CodeTemplate(fmt).format(env);
+}
+} // namespace at::jit

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_adaptive_avg_pool2d_compositeexplicitautograd_dispatch.h ADDED Viewed

	@@ -0,0 +1,26 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace compositeexplicitautograd { // declarations only; no definitions appear in this header
+TORCH_API at::Tensor & _adaptive_avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef output_size); // 'out' is the first parameter
+TORCH_API at::Tensor & _adaptive_avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out); // 'out' is the last parameter
+TORCH_API at::Tensor & _adaptive_avg_pool2d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef output_size); // output_size as c10::SymIntArrayRef, 'out' first
+TORCH_API at::Tensor & _adaptive_avg_pool2d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef output_size, at::Tensor & out); // output_size as c10::SymIntArrayRef, 'out' last
+} // namespace compositeexplicitautograd
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cast_Short_native.h ADDED Viewed

	@@ -0,0 +1,21 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native { // declarations only; no definitions appear in this header
+TORCH_API at::Tensor _cast_Short(const at::Tensor & self, bool non_blocking=false); // returns a tensor by value; non_blocking defaults to false
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_debug_has_internal_overlap_ops.h ADDED Viewed

	@@ -0,0 +1,28 @@

+#pragma once
+// @generated by torchgen/gen.py from Operator.h
+#include <tuple>
+#include <vector>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace _ops {
+struct TORCH_API _debug_has_internal_overlap { // only type aliases, string constants and static functions; no data members
+  using schema = int64_t (const at::Tensor &); // C++ function type: takes a const Tensor reference, returns int64_t
+  using ptr_schema = schema*; // pointer to a function of that type
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_debug_has_internal_overlap")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_debug_has_internal_overlap(Tensor self) -> int")
+  static int64_t call(const at::Tensor & self); // same signature as 'schema'
+  static int64_t redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); // 'schema' arguments preceded by an explicit c10::DispatchKeySet
+};
+}} // namespace at::_ops

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_fake_quantize_learnable_per_tensor_affine_cuda_dispatch.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace cuda { // declarations only; no definitions appear in this header
+TORCH_API at::Tensor _fake_quantize_learnable_per_tensor_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t quant_min, int64_t quant_max, double grad_factor=1.0); // grad_factor defaults to 1.0; every other argument is required
+} // namespace cuda
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_flash_attention_backward_native.h ADDED Viewed

	@@ -0,0 +1,21 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native { // declarations only; no definitions appear in this header
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _flash_attention_backward(const at::Tensor & grad_out, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & cum_seq_q, const at::Tensor & cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, bool is_causal, const at::Tensor & philox_seed, const at::Tensor & philox_offset, c10::optional<double> scale=c10::nullopt); // returns three tensors by value (presumably the query/key/value gradients - confirm against the implementation); 'scale' is the only defaulted argument
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_acos_native.h ADDED Viewed

	@@ -0,0 +1,25 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native { // declarations only; no definitions appear in this header
+TORCH_API void _foreach_acos_out(at::TensorList self, at::TensorList out); // takes a separate 'out' list; returns nothing
+TORCH_API ::std::vector<at::Tensor> foreach_tensor_acos_slow(at::TensorList self); // returns a new vector of tensors
+TORCH_API void foreach_tensor_acos_slow_(at::TensorList self); // returns nothing; trailing underscore presumably marks the in-place variant - confirm
+TORCH_API ::std::vector<at::Tensor> foreach_tensor_acos_cuda(at::TensorList self); // returns a new vector of tensors
+TORCH_API void foreach_tensor_acos_cuda_(at::TensorList self); // returns nothing; trailing underscore presumably marks the in-place variant - confirm
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_slogdet_meta.h ADDED Viewed

	@@ -0,0 +1,27 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace meta {
+struct TORCH_API structured__linalg_slogdet : public at::impl::MetaBase { // derives publicly from at::impl::MetaBase
+    void meta(const at::Tensor & A); // declared here; defined elsewhere
+};
+} // namespace meta
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_linalg_svd_cpu_dispatch.h ADDED Viewed

	@@ -0,0 +1,25 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace cpu { // declarations only; no definitions appear in this header
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _linalg_svd(const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, c10::optional<c10::string_view> driver=c10::nullopt); // returns three tensors by value
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _linalg_svd_out(at::Tensor & U, at::Tensor & S, at::Tensor & Vh, const at::Tensor & A, bool full_matrices=false, bool compute_uv=true, c10::optional<c10::string_view> driver=c10::nullopt); // output tensors U, S, Vh come first; returns a tuple of references
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> _linalg_svd_outf(const at::Tensor & A, bool full_matrices, bool compute_uv, c10::optional<c10::string_view> driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh); // output tensors U, S, Vh come last; no defaulted arguments
+} // namespace cpu
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_logcumsumexp.h ADDED Viewed

	@@ -0,0 +1,39 @@

+#pragma once
+// @generated by torchgen/gen.py from Function.h
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <ATen/ops/_logcumsumexp_ops.h>
+namespace at {
+// Functional form: forwards (self, dim) to the operator's call().
+// Schema: aten::_logcumsumexp(Tensor self, int dim) -> Tensor
+inline at::Tensor _logcumsumexp(const at::Tensor & self, int64_t dim) {
+    using op = at::_ops::_logcumsumexp;
+    return op::call(self, dim);
+}
+// Out variant with `out` as the leading argument; the operator itself
+// receives it last.
+// Schema: aten::_logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _logcumsumexp_out(at::Tensor & out, const at::Tensor & self, int64_t dim) {
+    using op = at::_ops::_logcumsumexp_out;
+    return op::call(self, dim, out);
+}
+// Out variant with `out` as the trailing argument, matching the schema order.
+// Schema: aten::_logcumsumexp.out(Tensor self, int dim, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & _logcumsumexp_outf(const at::Tensor & self, int64_t dim, at::Tensor & out) {
+    using op = at::_ops::_logcumsumexp_out;
+    return op::call(self, dim, out);
+}
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_native_multi_head_attention.h ADDED Viewed

	@@ -0,0 +1,39 @@

+#pragma once
+// @generated by torchgen/gen.py from Function.h
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <ATen/ops/_native_multi_head_attention_ops.h>
+namespace at {
+// Functional form: forwards every argument, in order, to the operator's call().
+// Schema: aten::_native_multi_head_attention(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None) -> (Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor> _native_multi_head_attention(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask={}, bool need_weights=true, bool average_attn_weights=true, c10::optional<int64_t> mask_type=c10::nullopt) {
+    using op = at::_ops::_native_multi_head_attention;
+    return op::call(
+        query, key, value,
+        embed_dim, num_head,
+        qkv_weight, qkv_bias,
+        proj_weight, proj_bias,
+        mask, need_weights, average_attn_weights, mask_type);
+}
+// Out variant with out0/out1 as the leading arguments; the operator itself
+// receives them last.
+// Schema: aten::_native_multi_head_attention.out(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+inline ::std::tuple<at::Tensor &,at::Tensor &> _native_multi_head_attention_out(at::Tensor & out0, at::Tensor & out1, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask={}, bool need_weights=true, bool average_attn_weights=true, c10::optional<int64_t> mask_type=c10::nullopt) {
+    using op = at::_ops::_native_multi_head_attention_out;
+    return op::call(
+        query, key, value,
+        embed_dim, num_head,
+        qkv_weight, qkv_bias,
+        proj_weight, proj_bias,
+        mask, need_weights, average_attn_weights, mask_type,
+        out0, out1);
+}
+// Out variant with out0/out1 as the trailing arguments, matching the schema
+// order; no argument has a default.
+// Schema: aten::_native_multi_head_attention.out(Tensor query, Tensor key, Tensor value, int embed_dim, int num_head, Tensor qkv_weight, Tensor qkv_bias, Tensor proj_weight, Tensor proj_bias, Tensor? mask=None, bool need_weights=True, bool average_attn_weights=True, int? mask_type=None, *, Tensor(a!) out0, Tensor(b!) out1) -> (Tensor(a!), Tensor(b!))
+inline ::std::tuple<at::Tensor &,at::Tensor &> _native_multi_head_attention_outf(const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, int64_t embed_dim, int64_t num_head, const at::Tensor & qkv_weight, const at::Tensor & qkv_bias, const at::Tensor & proj_weight, const at::Tensor & proj_bias, const c10::optional<at::Tensor> & mask, bool need_weights, bool average_attn_weights, c10::optional<int64_t> mask_type, at::Tensor & out0, at::Tensor & out1) {
+    using op = at::_ops::_native_multi_head_attention_out;
+    return op::call(
+        query, key, value,
+        embed_dim, num_head,
+        qkv_weight, qkv_bias,
+        proj_weight, proj_bias,
+        mask, need_weights, average_attn_weights, mask_type,
+        out0, out1);
+}
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nested_tensor_storage_offsets_native.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native { // declarations only; no definitions appear in this header
+TORCH_API at::Tensor & _nested_tensor_storage_offsets_out(const at::Tensor & self, at::Tensor & out); // takes 'out' last and returns a Tensor reference
+TORCH_API at::Tensor _nested_tensor_storage_offsets(const at::Tensor & self); // returns a tensor by value
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_log_softmax_backward_data_native.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native { // declarations only; no definitions appear in this header
+TORCH_API at::Tensor & _sparse_log_softmax_backward_data_out(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self, at::Tensor & out); // takes 'out' last and returns a Tensor reference
+TORCH_API at::Tensor log_softmax_backward_sparse_cpu(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self); // same parameters as the _cuda declaration below
+TORCH_API at::Tensor log_softmax_backward_sparse_cuda(const at::Tensor & grad_output, const at::Tensor & output, int64_t dim, const at::Tensor & self); // same parameters as the _cpu declaration above
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_to_sparse_native.h ADDED Viewed

	@@ -0,0 +1,28 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native { // declarations only; each conversion name is overloaded on (sparse_dim) and (layout, blocksize, dense_dim)
+TORCH_API at::Tensor & _to_sparse_sparse_dim_out(const at::Tensor & self, int64_t sparse_dim, at::Tensor & out); // sparse_dim form, 'out' last
+TORCH_API at::Tensor dense_to_sparse(const at::Tensor & self, int64_t sparse_dim); // sparse_dim overload
+TORCH_API at::Tensor sparse_coo_to_sparse(const at::Tensor & self, int64_t sparse_dim); // sparse_dim overload
+TORCH_API at::Tensor sparse_compressed_to_sparse(const at::Tensor & self, int64_t sparse_dim); // sparse_dim overload
+TORCH_API at::Tensor & _to_sparse_out(const at::Tensor & self, c10::optional<at::Layout> layout, at::OptionalIntArrayRef blocksize, c10::optional<int64_t> dense_dim, at::Tensor & out); // layout form, 'out' last, no defaults
+TORCH_API at::Tensor dense_to_sparse(const at::Tensor & self, c10::optional<at::Layout> layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional<int64_t> dense_dim=c10::nullopt); // layout overload; all optional arguments default to c10::nullopt
+TORCH_API at::Tensor sparse_coo_to_sparse(const at::Tensor & self, c10::optional<at::Layout> layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional<int64_t> dense_dim=c10::nullopt); // layout overload; all optional arguments default to c10::nullopt
+TORCH_API at::Tensor sparse_compressed_to_sparse(const at::Tensor & self, c10::optional<at::Layout> layout=c10::nullopt, at::OptionalIntArrayRef blocksize=c10::nullopt, c10::optional<int64_t> dense_dim=c10::nullopt); // layout overload; all optional arguments default to c10::nullopt
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_upsample_nearest_exact2d_backward_meta.h ADDED Viewed

	@@ -0,0 +1,27 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace meta {
+struct TORCH_API structured__upsample_nearest_exact2d_backward : public at::impl::MetaBase { // derives publicly from at::impl::MetaBase
+    void meta(const at::Tensor & grad_output, at::ArrayRef<int64_t> output_size, at::ArrayRef<int64_t> input_size, c10::optional<double> scales_h, c10::optional<double> scales_w); // declared here; defined elsewhere
+};
+} // namespace meta
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_validate_sparse_csr_tensor_args.h ADDED Viewed

	@@ -0,0 +1,30 @@

+#pragma once
+// @generated by torchgen/gen.py from Function.h
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <ATen/ops/_validate_sparse_csr_tensor_args_ops.h>
+namespace at {
+// Forwards all four arguments, in order, to the operator's call(); nothing
+// is returned.
+// Schema: aten::_validate_sparse_csr_tensor_args(Tensor crow_indices, Tensor col_indices, Tensor values, int[] size) -> ()
+inline void _validate_sparse_csr_tensor_args(const at::Tensor & crow_indices, const at::Tensor & col_indices, const at::Tensor & values, at::IntArrayRef size) {
+    using op = at::_ops::_validate_sparse_csr_tensor_args;
+    op::call(crow_indices, col_indices, values, size);
+}
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/align_as_ops.h ADDED Viewed

	@@ -0,0 +1,28 @@

+#pragma once
+// @generated by torchgen/gen.py from Operator.h
+#include <tuple>
+#include <vector>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace _ops {
+struct TORCH_API align_as { // only type aliases, string constants and static functions; no data members
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &); // C++ function type: two const Tensor references in, Tensor out
+  using ptr_schema = schema*; // pointer to a function of that type
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::align_as")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "align_as(Tensor self, Tensor other) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & other); // same signature as 'schema'
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other); // 'schema' arguments preceded by an explicit c10::DispatchKeySet
+};
+}} // namespace at::_ops

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/aminmax_meta.h ADDED Viewed

	@@ -0,0 +1,27 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace meta {
+struct TORCH_API structured_aminmax : public at::impl::MetaBase { // derives publicly from at::impl::MetaBase
+    void meta(const at::Tensor & self, c10::optional<int64_t> dim, bool keepdim); // declared here; defined elsewhere
+};
+} // namespace meta
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ceil_ops.h ADDED Viewed

	@@ -0,0 +1,50 @@

+#pragma once
+// @generated by torchgen/gen.py from Operator.h
+#include <tuple>
+#include <vector>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace _ops {
+struct TORCH_API ceil { // name "aten::ceil", empty overload name
+  using schema = at::Tensor (const at::Tensor &); // C++ function type: const Tensor reference in, Tensor out
+  using ptr_schema = schema*; // pointer to a function of that type
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::ceil")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "ceil(Tensor self) -> Tensor")
+  static at::Tensor call(const at::Tensor & self); // same signature as 'schema'
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); // 'schema' arguments preceded by an explicit c10::DispatchKeySet
+};
+struct TORCH_API ceil_ { // name "aten::ceil_"; takes and returns a mutable Tensor reference
+  using schema = at::Tensor & (at::Tensor &); // C++ function type: mutable Tensor reference in and out
+  using ptr_schema = schema*; // pointer to a function of that type
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::ceil_")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "ceil_(Tensor(a!) self) -> Tensor(a!)")
+  static at::Tensor & call(at::Tensor & self); // same signature as 'schema'
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, at::Tensor & self); // 'schema' arguments preceded by an explicit c10::DispatchKeySet
+};
+struct TORCH_API ceil_out { // shares the name "aten::ceil" with the first struct; distinguished by overload name "out"
+  using schema = at::Tensor & (const at::Tensor &, at::Tensor &); // C++ function type: (self, out) in, Tensor reference out
+  using ptr_schema = schema*; // pointer to a function of that type
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::ceil")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "ceil.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & self, at::Tensor & out); // same signature as 'schema'
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); // 'schema' arguments preceded by an explicit c10::DispatchKeySet
+};
+}} // namespace at::_ops

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/constant_pad_nd_compositeexplicitautograd_dispatch.h ADDED Viewed

	@@ -0,0 +1,28 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace compositeexplicitautograd { // declarations only; no definitions appear in this header
+TORCH_API at::Tensor constant_pad_nd(const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value=0); // 'value' defaults to 0
+TORCH_API at::Tensor constant_pad_nd_symint(const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value=0); // 'pad' as c10::SymIntArrayRef; 'value' defaults to 0
+TORCH_API at::Tensor & constant_pad_nd_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value=0); // 'out' is the first parameter; 'value' defaults to 0
+TORCH_API at::Tensor & constant_pad_nd_outf(const at::Tensor & self, at::IntArrayRef pad, const at::Scalar & value, at::Tensor & out); // 'out' is the last parameter; no defaults
+TORCH_API at::Tensor & constant_pad_nd_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value=0); // 'pad' as c10::SymIntArrayRef; 'out' first
+TORCH_API at::Tensor & constant_pad_nd_symint_outf(const at::Tensor & self, c10::SymIntArrayRef pad, const at::Scalar & value, at::Tensor & out); // 'pad' as c10::SymIntArrayRef; 'out' last, no defaults
+} // namespace compositeexplicitautograd
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/dot_cuda_dispatch.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace cuda { // declarations only; no definitions appear in this header
+TORCH_API at::Tensor dot(const at::Tensor & self, const at::Tensor & tensor); // takes two const Tensor references and returns a tensor by value
+} // namespace cuda
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_hfft.h ADDED Viewed

	@@ -0,0 +1,91 @@

+#pragma once
+// @generated by torchgen/gen.py from Function.h
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <ATen/ops/fft_hfft_ops.h>
+namespace at {
+// aten::fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+inline at::Tensor fft_hfft(const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+    return at::_ops::fft_hfft::call(self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor fft_hfft(const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+    return at::_ops::fft_hfft::call(self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm);
+  }
+}
+// aten::fft_hfft(Tensor self, SymInt? n=None, int dim=-1, str? norm=None) -> Tensor
+inline at::Tensor fft_hfft_symint(const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+    return at::_ops::fft_hfft::call(self, n, dim, norm);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor fft_hfft(const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+    return at::_ops::fft_hfft::call(self, n, dim, norm);
+  }
+}
+// aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & fft_hfft_out(at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+    return at::_ops::fft_hfft_out::call(self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & fft_hfft_out(at::Tensor & out, const at::Tensor & self, c10::optional<int64_t> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+    return at::_ops::fft_hfft_out::call(self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+  }
+}
+// aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+inline at::Tensor & fft_hfft_outf(const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+    return at::_ops::fft_hfft_out::call(self, n.has_value() ? c10::make_optional(c10::SymInt(*n)) : c10::nullopt, dim, norm, out);
+}
+namespace symint {
+  // symint::fft_hfft_outf<T> for T = int64_t (selected through the enable_if
+  // default template argument). `out` is the last parameter; the optional
+  // integer length is wrapped into an optional c10::SymInt before the call.
+  template <typename T, typename = std::enable_if_t<std::is_same<T, int64_t>::value>>
+  at::Tensor & fft_hfft_outf(const at::Tensor & self, c10::optional<int64_t> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+    c10::optional<c10::SymInt> sym_n = c10::nullopt;
+    if (n.has_value()) {
+      sym_n = c10::make_optional(c10::SymInt(*n));
+    }
+    return at::_ops::fft_hfft_out::call(self, sym_n, dim, norm, out);
+  }
+}
+// Schema: aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+// SymInt flavour of the out variant with `out` as the FIRST parameter. The
+// length is already an optional c10::SymInt, so no conversion is needed; the
+// arguments are only reordered so that `out` is passed last to the operator.
+inline at::Tensor & fft_hfft_symint_out(at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+    using Op = at::_ops::fft_hfft_out;
+    return Op::call(self, n, dim, norm, out);
+}
+namespace symint {
+  // symint::fft_hfft_out<T> for T = c10::SymInt (selected through the enable_if
+  // default template argument). `out` comes first in the signature and is
+  // passed last to the operator; the length needs no conversion.
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & fft_hfft_out(at::Tensor & out, const at::Tensor & self, c10::optional<c10::SymInt> n=c10::nullopt, int64_t dim=-1, c10::optional<c10::string_view> norm=c10::nullopt) {
+    using Op = at::_ops::fft_hfft_out;
+    return Op::call(self, n, dim, norm, out);
+  }
+}
+// Schema: aten::fft_hfft.out(Tensor self, SymInt? n=None, int dim=-1, str? norm=None, *, Tensor(a!) out) -> Tensor(a!)
+// SymInt flavour of the "outf" spelling: no default arguments, `out` is the
+// LAST parameter, and all arguments reach the operator in the order given.
+inline at::Tensor & fft_hfft_symint_outf(const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+    using Op = at::_ops::fft_hfft_out;
+    return Op::call(self, n, dim, norm, out);
+}
+namespace symint {
+  // symint::fft_hfft_outf<T> for T = c10::SymInt (selected through the
+  // enable_if default template argument). Parameter order already matches the
+  // operator's, so the call is a direct pass-through.
+  template <typename T, typename = std::enable_if_t<std::is_same<T, c10::SymInt>::value>>
+  at::Tensor & fft_hfft_outf(const at::Tensor & self, c10::optional<c10::SymInt> n, int64_t dim, c10::optional<c10::string_view> norm, at::Tensor & out) {
+    using Op = at::_ops::fft_hfft_out;
+    return Op::call(self, n, dim, norm, out);
+  }
+}
+}

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_irfft2_native.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native {  // declarations only; no function bodies appear in this header
+TORCH_API at::Tensor fft_irfft2_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::IntArrayRef dim={-2,-1}, c10::optional<c10::string_view> norm=c10::nullopt);  // returns a Tensor by value; s and norm default to nullopt, dim defaults to {-2,-1}
+TORCH_API at::Tensor & fft_irfft2_symint_out(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::IntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out);  // out variant: no defaults, `out` is the last parameter, returns at::Tensor &
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_rfftn_native.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native {  // declarations only; no function bodies appear in this header
+TORCH_API at::Tensor fft_rfftn_symint(const at::Tensor & self, at::OptionalSymIntArrayRef s=c10::nullopt, at::OptionalIntArrayRef dim=c10::nullopt, c10::optional<c10::string_view> norm=c10::nullopt);  // returns a Tensor by value; s, dim and norm all default to nullopt
+TORCH_API at::Tensor & fft_rfftn_symint_out(const at::Tensor & self, at::OptionalSymIntArrayRef s, at::OptionalIntArrayRef dim, c10::optional<c10::string_view> norm, at::Tensor & out);  // out variant: no defaults, `out` is the last parameter, returns at::Tensor &
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fix.h ADDED Viewed

	@@ -0,0 +1,44 @@

+#pragma once
+// @generated by torchgen/gen.py from Function.h
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <ATen/ops/fix_ops.h>
+namespace at {
+// Schema: aten::fix(Tensor self) -> Tensor
+// Functional form; returns the operator's result by value.
+inline at::Tensor fix(const at::Tensor & self) {
+    using Op = at::_ops::fix;
+    return Op::call(self);
+}
+// Schema: aten::fix_(Tensor(a!) self) -> Tensor(a!)
+// Variant that takes `self` by non-const reference.
+inline at::Tensor & fix_(at::Tensor & self) {
+    using Op = at::_ops::fix_;
+    return Op::call(self);
+}
+// Schema: aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+// Out variant with `out` as the first parameter; it is passed last to the operator.
+inline at::Tensor & fix_out(at::Tensor & out, const at::Tensor & self) {
+    using Op = at::_ops::fix_out;
+    return Op::call(self, out);
+}
+// Schema: aten::fix.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
+// "outf" spelling of the out variant: `out` is the last parameter.
+inline at::Tensor & fix_outf(const at::Tensor & self, at::Tensor & out) {
+    using Op = at::_ops::fix_out;
+    return Op::call(self, out);
+}
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/float_power_compositeimplicitautograd_dispatch.h ADDED Viewed

	@@ -0,0 +1,33 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace compositeimplicitautograd {  // declarations only; the file header names RegisterDispatchKey.cpp as the implementing file
+TORCH_API at::Tensor float_power(const at::Tensor & self, const at::Tensor & exponent);  // Tensor base, Tensor exponent
+TORCH_API at::Tensor & float_power_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & exponent);  // `out` is the first parameter
+TORCH_API at::Tensor & float_power_outf(const at::Tensor & self, const at::Tensor & exponent, at::Tensor & out);  // `out` is the last parameter
+TORCH_API at::Tensor & float_power_(at::Tensor & self, const at::Tensor & exponent);  // takes `self` by non-const reference
+TORCH_API at::Tensor float_power(const at::Scalar & self, const at::Tensor & exponent);  // Scalar base, Tensor exponent
+TORCH_API at::Tensor & float_power_out(at::Tensor & out, const at::Scalar & self, const at::Tensor & exponent);  // `out` is the first parameter
+TORCH_API at::Tensor & float_power_outf(const at::Scalar & self, const at::Tensor & exponent, at::Tensor & out);  // `out` is the last parameter
+TORCH_API at::Tensor float_power(const at::Tensor & self, const at::Scalar & exponent);  // Tensor base, Scalar exponent
+TORCH_API at::Tensor & float_power_out(at::Tensor & out, const at::Tensor & self, const at::Scalar & exponent);  // `out` is the first parameter
+TORCH_API at::Tensor & float_power_outf(const at::Tensor & self, const at::Scalar & exponent, at::Tensor & out);  // `out` is the last parameter
+TORCH_API at::Tensor & float_power_(at::Tensor & self, const at::Scalar & exponent);  // takes `self` by non-const reference
+} // namespace compositeimplicitautograd
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/geometric_cpu_dispatch.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace cpu {  // declaration only; the file header names RegisterDispatchKey.cpp as the implementing file
+TORCH_API at::Tensor & geometric_(at::Tensor & self, double p, c10::optional<at::Generator> generator=c10::nullopt);  // takes `self` by non-const reference; `generator` defaults to nullopt
+} // namespace cpu
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/glu_backward_ops.h ADDED Viewed

	@@ -0,0 +1,39 @@

+#pragma once
+// @generated by torchgen/gen.py from Operator.h
+#include <tuple>
+#include <vector>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace _ops {
+struct TORCH_API glu_backward_grad_input {  // describes the "grad_input" overload of aten::glu_backward (see name / overload_name below)
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, int64_t, at::Tensor &);  // C++ function type with the same parameter list as call()
+  using ptr_schema = schema*;  // pointer-to-function form of `schema`
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::glu_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "grad_input")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "glu_backward.grad_input(Tensor grad_output, Tensor self, int dim, *, Tensor(a!) grad_input) -> Tensor(a!)")
+  static at::Tensor & call(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, at::Tensor & grad_input);  // declared here, not defined in this header
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim, at::Tensor & grad_input);  // same parameters as call() plus a leading c10::DispatchKeySet
+};
+struct TORCH_API glu_backward {  // describes the overload of aten::glu_backward whose overload_name is empty
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t);  // C++ function type with the same parameter list as call()
+  using ptr_schema = schema*;  // pointer-to-function form of `schema`
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::glu_backward")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "glu_backward(Tensor grad_output, Tensor self, int dim) -> Tensor")
+  static at::Tensor call(const at::Tensor & grad_output, const at::Tensor & self, int64_t dim);  // declared here, not defined in this header
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_output, const at::Tensor & self, int64_t dim);  // same parameters as call() plus a leading c10::DispatchKeySet
+};
+}} // namespace at::_ops

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardshrink_native.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+#include <ATen/ops/hardshrink_meta.h>
+namespace at {
+namespace native {
+struct TORCH_API structured_hardshrink_out : public at::meta::structured_hardshrink {  // publicly derives from the meta struct brought in by <ATen/ops/hardshrink_meta.h>
+void impl(const at::Tensor & self, const at::Scalar & lambd, const at::Tensor & out);  // declaration only; note `out` is passed as a const reference
+};
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hardtanh_backward_cpu_dispatch.h ADDED Viewed

	@@ -0,0 +1,25 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace cpu {  // declarations only; the file header names RegisterDispatchKey.cpp as the implementing file
+TORCH_API at::Tensor hardtanh_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val);  // returns a Tensor by value
+TORCH_API at::Tensor & hardtanh_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val);  // `grad_input` is the first parameter
+TORCH_API at::Tensor & hardtanh_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & min_val, const at::Scalar & max_val, at::Tensor & grad_input);  // `grad_input` is the last parameter
+} // namespace cpu
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/isposinf_cuda_dispatch.h ADDED Viewed

	@@ -0,0 +1,25 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace cuda {  // declarations only; the file header names RegisterDispatchKey.cpp as the implementing file
+TORCH_API at::Tensor isposinf(const at::Tensor & self);  // returns a Tensor by value
+TORCH_API at::Tensor & isposinf_out(at::Tensor & out, const at::Tensor & self);  // `out` is the first parameter
+TORCH_API at::Tensor & isposinf_outf(const at::Tensor & self, at::Tensor & out);  // `out` is the last parameter
+} // namespace cuda
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/le_compositeexplicitautogradnonfunctional_dispatch.h ADDED Viewed

	@@ -0,0 +1,26 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace compositeexplicitautogradnonfunctional {  // declarations only; the file header names RegisterDispatchKey.cpp as the implementing file
+TORCH_API at::Tensor le(const at::Tensor & self, const at::Scalar & other);  // Scalar `other`; returns a Tensor by value
+TORCH_API at::Tensor & le_(at::Tensor & self, const at::Scalar & other);  // Scalar `other`; takes `self` by non-const reference
+TORCH_API at::Tensor le(const at::Tensor & self, const at::Tensor & other);  // Tensor `other`; returns a Tensor by value
+TORCH_API at::Tensor & le_(at::Tensor & self, const at::Tensor & other);  // Tensor `other`; takes `self` by non-const reference
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_qr_compositeexplicitautogradnonfunctional_dispatch.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace compositeexplicitautogradnonfunctional {  // declaration only; the file header names RegisterDispatchKey.cpp as the implementing file
+TORCH_API ::std::tuple<at::Tensor,at::Tensor> linalg_qr(const at::Tensor & A, c10::string_view mode="reduced");  // returns a pair of Tensors; `mode` defaults to "reduced"
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_svd_native.h ADDED Viewed

	@@ -0,0 +1,22 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace native {  // declarations only; no function bodies appear in this header
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor> linalg_svd(const at::Tensor & A, bool full_matrices=true, c10::optional<c10::string_view> driver=c10::nullopt);  // returns three Tensors by value; full_matrices defaults to true, driver to nullopt
+TORCH_API ::std::tuple<at::Tensor &,at::Tensor &,at::Tensor &> linalg_svd_out(const at::Tensor & A, bool full_matrices, c10::optional<c10::string_view> driver, at::Tensor & U, at::Tensor & S, at::Tensor & Vh);  // out variant: no defaults; U, S, Vh are trailing non-const references; returns a tuple of references
+} // namespace native
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_vector_norm_meta.h ADDED Viewed

	@@ -0,0 +1,27 @@

+#pragma once
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+namespace at {
+namespace meta {
+struct TORCH_API structured_linalg_vector_norm : public at::impl::MetaBase {  // publicly derives from at::impl::MetaBase
+    void meta(const at::Tensor & self, const at::Scalar & ord, at::OptionalIntArrayRef dim, bool keepdim, c10::optional<at::ScalarType> dtype);  // declaration only; no body in this header
+};
+} // namespace meta
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/logaddexp2_cuda_dispatch.h ADDED Viewed

	@@ -0,0 +1,25 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace cuda {  // declarations only; the file header names RegisterDispatchKey.cpp as the implementing file
+TORCH_API at::Tensor logaddexp2(const at::Tensor & self, const at::Tensor & other);  // returns a Tensor by value
+TORCH_API at::Tensor & logaddexp2_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other);  // `out` is the first parameter
+TORCH_API at::Tensor & logaddexp2_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out);  // `out` is the last parameter
+} // namespace cuda
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool1d_with_indices.h ADDED Viewed

	@@ -0,0 +1,30 @@

+#pragma once
+// @generated by torchgen/gen.py from Function.h
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <c10/util/Optional.h>
+#include <ATen/ops/max_pool1d_with_indices_ops.h>
+namespace at {
+// Schema: aten::max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
+// Inline convenience wrapper: every argument is passed through, in order, to
+// the operator's call() and its two-Tensor tuple is returned as-is.
+inline ::std::tuple<at::Tensor,at::Tensor> max_pool1d_with_indices(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, at::IntArrayRef dilation=1, bool ceil_mode=false) {
+    using Op = at::_ops::max_pool1d_with_indices;
+    return Op::call(self, kernel_size, stride, padding, dilation, ceil_mode);
+}
+} // namespace at

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/max_pool2d_with_indices_ops.h ADDED Viewed

	@@ -0,0 +1,39 @@

+#pragma once
+// @generated by torchgen/gen.py from Operator.h
+#include <tuple>
+#include <vector>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace _ops {
+struct TORCH_API max_pool2d_with_indices_out {  // describes the "out" overload of aten::max_pool2d_with_indices (see name / overload_name below)
+  using schema = ::std::tuple<at::Tensor &,at::Tensor &> (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool, at::Tensor &, at::Tensor &);  // C++ function type with the same parameter list as call()
+  using ptr_schema = schema*;  // pointer-to-function form of `schema`
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::max_pool2d_with_indices")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "max_pool2d_with_indices.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False, *, Tensor(a!) out, Tensor(b!) indices) -> (Tensor(a!), Tensor(b!))")
+  static ::std::tuple<at::Tensor &,at::Tensor &> call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices);  // declared here, not defined in this header
+  static ::std::tuple<at::Tensor &,at::Tensor &> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode, at::Tensor & out, at::Tensor & indices);  // same parameters as call() plus a leading c10::DispatchKeySet
+};
+struct TORCH_API max_pool2d_with_indices {  // describes the overload of aten::max_pool2d_with_indices whose overload_name is empty
+  using schema = ::std::tuple<at::Tensor,at::Tensor> (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, bool);  // C++ function type with the same parameter list as call()
+  using ptr_schema = schema*;  // pointer-to-function form of `schema`
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::max_pool2d_with_indices")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "max_pool2d_with_indices(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, int[2] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)")
+  static ::std::tuple<at::Tensor,at::Tensor> call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode);  // declared here, not defined in this header
+  static ::std::tuple<at::Tensor,at::Tensor> redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool ceil_mode);  // same parameters as call() plus a leading c10::DispatchKeySet
+};
+}} // namespace at::_ops

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/miopen_convolution_relu_ops.h ADDED Viewed

	@@ -0,0 +1,28 @@

+#pragma once
+// @generated by torchgen/gen.py from Operator.h
+#include <tuple>
+#include <vector>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace _ops {
+struct TORCH_API miopen_convolution_relu {  // describes aten::miopen_convolution_relu; overload_name is empty (see below)
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt);  // C++ function type with the same parameter list as call()
+  using ptr_schema = schema*;  // pointer-to-function form of `schema`
+  // See Note [static constexpr char* members for windows NVCC]
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::miopen_convolution_relu")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "")
+  STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "miopen_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor")
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups);  // declared here, not defined in this header; `bias` is optional
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const c10::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups);  // same parameters as call() plus a leading c10::DispatchKeySet
+};
+}} // namespace at::_ops

tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/mkldnn_rnn_layer_backward_cpu_dispatch.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+namespace at {
+namespace cpu {  // declaration only; the file header names RegisterDispatchKey.cpp as the implementing file
+TORCH_API ::std::tuple<at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor,at::Tensor> mkldnn_rnn_layer_backward(const at::Tensor & input, const at::Tensor & weight1, const at::Tensor & weight2, const at::Tensor & weight3, const at::Tensor & weight4, const at::Tensor & hx_, const at::Tensor & cx_tmp, const at::Tensor & output, const at::Tensor & hy_, const at::Tensor & cy_, const c10::optional<at::Tensor> & grad_output, const c10::optional<at::Tensor> & grad_hy, const c10::optional<at::Tensor> & grad_cy, bool reverse, int64_t mode, int64_t hidden_size, int64_t num_layers, bool has_biases, bool train, bool bidirectional, at::IntArrayRef batch_sizes, bool batch_first, const at::Tensor & workspace);  // returns a tuple of seven Tensors; grad_output, grad_hy and grad_cy are optional; no parameter has a default
+} // namespace cpu
+} // namespace at