diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ArrayRef.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ArrayRef.h new file mode 100644 index 0000000000000000000000000000000000000000..0461d5953ed8a7783c82402ca4523b0b0a1ad465 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ArrayRef.h @@ -0,0 +1,2 @@ +#pragma once +#include diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Functions.h new file mode 100644 index 0000000000000000000000000000000000000000..8327d00ece5c0f2b2c1e81f88e54fa0c76d9e97d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Functions.h @@ -0,0 +1,1427 @@ +#pragma once + +// @generated by torchgen/gen.py from Functions.h + +#ifdef TORCH_ASSERT_NO_OPERATORS +#error This change adds a dependency on native_functions.yaml, \ + meaning the file will need to be re-compiled every time an operator \ + is changed or added. Consider if your change would be better placed in \ + another file, or if a more specific header might achieve the same goal. \ + See NOTE: [Tensor vs. TensorBase] +#endif + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from and \ + see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. +#endif + +// NOTE: [TORCH_ASSERT_ONLY_METHOD_OPERATORS] +// +// In ATen, certain generated headers files include the definitions of +// every single operator in PyTorch. Unfortunately this means every +// time an operator signature is updated or changed in +// native_functions.yaml, you (and every other PyTorch developer) need +// to recompile every source file that includes any of these headers. +// +// To break up these header dependencies, and improve incremental +// build times for all PyTorch developers. These headers are split +// into per-operator headers in the `ATen/ops` folder. This limits +// incremental builds to only changes to methods of `Tensor`, or files +// that use the specific operator being changed. With `at::sum` as an +// example, you should include +// +// // instead of ATen/Functions.h +// // instead of ATen/NativeFunctions.h +// // instead of ATen/Operators.h +// // instead of ATen/CPUFunctions.h +// +// However, even if you're careful to use this in your own code. +// `Functions.h` might be included indirectly through another header +// without you realising. To avoid this, you can add +// +// #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +// +// to the top of your source file. This way any time the non-specific +// headers are included, the compiler will error out. +// +// Also, be aware that `ops` are not available in all build +// configurations (namely fb-internal) so you must guard these +// includes with `#ifdef AT_PER_OPERATOR_HEADERS`. e.g. +// +// #ifndef AT_PER_OPERATOR_HEADERS +// #include +// #else +// #include +// #endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { + + + +// Special C++ only overloads for std()-like functions (See gh-40287) +// These are needed because int -> bool conversion takes precedence over int -> IntArrayRef +// So, for example std(0) would select the std(unbiased=False) overload +TORCH_API inline Tensor var(const Tensor& self, int dim) { + return at::var(self, IntArrayRef{dim}); +} +TORCH_API inline std::tuple var_mean(const Tensor& self, int dim) { + return at::var_mean(self, IntArrayRef{dim}); +} +TORCH_API inline Tensor std(const Tensor& self, int dim) { + return at::std(self, IntArrayRef{dim}); +} +TORCH_API inline std::tuple std_mean(const Tensor& self, int dim) { + return at::std_mean(self, IntArrayRef{dim}); +} + +inline int64_t numel(const Tensor& tensor) { + return tensor.numel(); +} + +inline int64_t size(const Tensor& tensor, int64_t dim) { + return tensor.size(dim); +} + +inline int64_t stride(const Tensor& tensor, int64_t dim) { + return tensor.stride(dim); +} + +inline bool is_complex(const Tensor& tensor) { + return tensor.is_complex(); +} + +inline bool is_floating_point(const Tensor& tensor) { + return tensor.is_floating_point(); +} + +inline bool is_signed(const Tensor& tensor) { + return tensor.is_signed(); +} + +inline bool is_inference(const Tensor& tensor) { + return tensor.is_inference(); +} + +inline bool _is_zerotensor(const Tensor& tensor) { + return tensor._is_zerotensor(); +} + +inline bool is_conj(const Tensor& tensor) { + return tensor.is_conj(); +} + +inline Tensor conj(const Tensor& tensor) { + return tensor.conj(); +} + +inline bool is_neg(const Tensor& tensor) { + return tensor.is_neg(); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MemoryOverlap.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MemoryOverlap.h new file mode 100644 index 0000000000000000000000000000000000000000..d41324249b39bc4f061a9cca62799057ac76ec43 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/MemoryOverlap.h @@ -0,0 +1,42 @@ +#pragma once + +#include + +namespace c10 { +struct TensorImpl; +} + +namespace at { +class TensorBase; + +// MemOverlap: Whether or not there is memory overlap +// +// No: Absolutely no memory overlap +// Yes: Absolutely yes memory overlap +// TooHard: There might be memory overlap, but it was too expensive to compute. +// +// NB: Please update the python test for these if you renumber them. +enum class MemOverlap { No, Yes, TooHard }; + +enum class MemOverlapStatus { Full, Partial, No, TooHard }; + +TORCH_API MemOverlap has_internal_overlap(const TensorBase& t); +TORCH_API MemOverlap has_internal_overlap(c10::TensorImpl* t); + +TORCH_API void assert_no_internal_overlap(const TensorBase& t); +TORCH_API void assert_no_internal_overlap(c10::TensorImpl* t); + +TORCH_API MemOverlapStatus +get_overlap_status(const TensorBase& a, const TensorBase& b); +TORCH_API MemOverlapStatus +get_overlap_status(const c10::TensorImpl* a, const c10::TensorImpl* b); + +TORCH_API void assert_no_partial_overlap( + const TensorBase& a, + const TensorBase& b); +void assert_no_partial_overlap(c10::TensorImpl* a, c10::TensorImpl* b); + +TORCH_API void assert_no_overlap(const TensorBase& a, const TensorBase& b); +TORCH_API void assert_no_overlap(c10::TensorImpl* a, c10::TensorImpl* b); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NativeMetaFunctions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NativeMetaFunctions.h new file mode 100644 index 0000000000000000000000000000000000000000..eabea23f7fbae46c2a291dae90c52067dbdc1e85 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NativeMetaFunctions.h @@ -0,0 +1,1303 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunctions.h + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +namespace meta { + + + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NumericUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NumericUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..788da64b4e4274bcda9adce01c39902d2aae61ef --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/NumericUtils.h @@ -0,0 +1,203 @@ +#pragma once + +#ifdef __HIPCC__ +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace at { + +// std::isnan isn't performant to use on integral types; it will +// (uselessly) convert to floating point and then do the test. +// This function is. + +template , int> = 0> +inline C10_HOST_DEVICE bool _isnan(T /*val*/) { + return false; +} + +template , int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return ::isnan(val); +#else + return std::isnan(val); +#endif +} + +template ::value, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return std::isnan(val.real()) || std::isnan(val.imag()); +} + +template , int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return at::_isnan(static_cast(val)); +} + +template < + typename T, + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { + return at::_isnan(static_cast(val)); +} + +inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { + return at::_isnan(static_cast(val)); +} + +template < + typename T, + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return val.isnan(); +} + +template < + typename T, + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return val.isnan(); +} + +template < + typename T, + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return val.isnan(); +} + +template < + typename T, + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return val.isnan(); +} + +// std::isinf isn't performant to use on integral types; it will +// (uselessly) convert to floating point and then do the test. +// This function is. + +template , int> = 0> +inline C10_HOST_DEVICE bool _isinf(T /*val*/) { + return false; +} + +template , int> = 0> +inline C10_HOST_DEVICE bool _isinf(T val) { +#if defined(__CUDACC__) || defined(__HIPCC__) + return ::isinf(val); +#else + return std::isinf(val); +#endif +} + +inline C10_HOST_DEVICE bool _isinf(at::Half val) { + return at::_isinf(static_cast(val)); +} + +inline C10_HOST_DEVICE bool _isinf(at::BFloat16 val) { + return at::_isinf(static_cast(val)); +} + +inline C10_HOST_DEVICE bool _isinf(at::Float8_e5m2 val) { + return val.isinf(); +} + +inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fn val) { + return false; +} + +inline C10_HOST_DEVICE bool _isinf(at::Float8_e5m2fnuz val) { + return false; +} + +inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fnuz val) { + return false; +} + +template +C10_HOST_DEVICE inline T exp(T x) { + static_assert( + !std::is_same_v, + "this template must be used with float or less precise type"); +#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) + // use __expf fast approximation for peak bandwidth + return __expf(x); +#else + return ::exp(x); +#endif +} + +template <> +C10_HOST_DEVICE inline double exp(double x) { + return ::exp(x); +} + +template +C10_HOST_DEVICE inline T log(T x) { + static_assert( + !std::is_same_v, + "this template must be used with float or less precise type"); +#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) + // use __logf fast approximation for peak bandwidth + return __logf(x); +#else + return ::log(x); +#endif +} + +template <> +C10_HOST_DEVICE inline double log(double x) { + return ::log(x); +} + +template +C10_HOST_DEVICE inline T log1p(T x) { + static_assert( + !std::is_same_v, + "this template must be used with float or less precise type"); +#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) + // use __logf fast approximation for peak bandwidth + // NOTE: There is no __log1pf so unfortunately we lose precision. + return __logf(1.0f + x); +#else + return ::log1p(x); +#endif +} + +template <> +C10_HOST_DEVICE inline double log1p(double x) { + return ::log1p(x); +} + +template +C10_HOST_DEVICE inline T tan(T x) { + static_assert( + !std::is_same_v, + "this template must be used with float or less precise type"); +#if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) + // use __tanf fast approximation for peak bandwidth + return __tanf(x); +#else + return ::tan(x); +#endif +} + +template <> +C10_HOST_DEVICE inline double tan(double x) { + return ::tan(x); +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/OpaqueTensorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/OpaqueTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..f71ae5358f29962bec24159ca40c10ec120f6ef1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/OpaqueTensorImpl.h @@ -0,0 +1,187 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { + +// An "Opaque" TensorImpl -- there are no strides and (for now) +// even data() is not supported (thus no pointer arithmetic). + +// NOTE: We could allow data() in the future, but would have to ensure pointer +// arithmetic code is properly guarded. +// +// NOTE: This does not support resize_ (and other metadata-changing ops) because +// of `shallow_copy_and_detach`. We would need to define an interface to +// "shallow copy" in order to add support. + +template +struct TORCH_API OpaqueTensorImpl : public TensorImpl { + // public constructor for now... + OpaqueTensorImpl( + at::DispatchKeySet key_set, + const caffe2::TypeMeta data_type, + c10::Device device, + OpaqueHandle opaque_handle, + c10::IntArrayRef sizes, + bool is_non_overlapping_and_dense = true) + : TensorImpl(key_set, data_type, device), + opaque_handle_(std::move(opaque_handle)) { + set_storage_access_should_throw(); + set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); + sizes_and_strides_.set_sizes(sizes); + refresh_numel(); + // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) + is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; + } + + // Destructor doesn't call release_resources because it's + // unnecessary; don't forget to change that if needed! + void release_resources() override { + TensorImpl::release_resources(); + opaque_handle_ = {}; + } + + void set_size(int64_t dim, int64_t new_size) override { + AT_ERROR("opaque tensors do not have set_size"); + } + + void set_stride(int64_t dim, int64_t new_stride) override { + AT_ERROR("opaque tensors do not have set_stride"); + } + + void set_storage_offset(int64_t storage_offset) override { + AT_ERROR("opaque tensors do not have set_storage_offset"); + } + +#ifdef DEBUG + bool has_storage() const override { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !storage_, "OpaqueTensorImpl assumes that storage_ is never set"); + return false; + } +#endif + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive>( + key_set(), + dtype(), + device(), + opaque_handle_, + sizes_and_strides_.sizes_arrayref()); + copy_tensor_metadata( + /*src_opaque_impl=*/this, + /*dest_opaque_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive>( + key_set(), + dtype(), + device(), + opaque_handle_, + sizes_and_strides_.sizes_arrayref()); + copy_tensor_metadata( + /*src_opaque_impl=*/this, + /*dest_opaque_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + + /** + * Shallow-copies data from another TensorImpl into this TensorImpl. + * + * For why this function doesn't check this TensorImpl's + * `allow_tensor_metadata_change_`, see NOTE [ TensorImpl Shallow-Copying ]. + */ + void shallow_copy_from(const c10::intrusive_ptr& impl) override { + AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set())); + auto opaque_impl = + static_cast*>(impl.get()); + copy_tensor_metadata( + /*src_impl=*/opaque_impl, + /*dest_impl=*/this, + /*version_counter=*/version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); + refresh_numel(); + } + + const OpaqueHandle& opaque_handle() const { + return opaque_handle_; + } + + OpaqueHandle& unsafe_opaque_handle() { + return opaque_handle_; + } + + protected: + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / + * storage_offset) from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE + * [ TensorImpl Shallow-Copying ]. + */ + static void copy_tensor_metadata( + const OpaqueTensorImpl* src_opaque_impl, + OpaqueTensorImpl* dest_opaque_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata( + src_opaque_impl, + dest_opaque_impl, + version_counter, + allow_tensor_metadata_change); + + // OpaqueTensorImpl-specific fields. + dest_opaque_impl->opaque_handle_ = src_opaque_impl->opaque_handle_; + } + + static void copy_tensor_metadata( + const OpaqueTensorImpl* src_opaque_impl, + OpaqueTensorImpl* dest_opaque_impl, + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata( + src_opaque_impl, + dest_opaque_impl, + std::move(version_counter), + allow_tensor_metadata_change); + + // OpaqueTensorImpl-specific fields. + dest_opaque_impl->opaque_handle_ = src_opaque_impl->opaque_handle_; + } + + private: + const char* tensorimpl_type_name() const override { + return "OpaqueTensorImpl"; + } + + OpaqueHandle opaque_handle_; +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Operators.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Operators.h new file mode 100644 index 0000000000000000000000000000000000000000..42302ed16e2e4a5019208038bbf39b5ff6210af9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Operators.h @@ -0,0 +1,1358 @@ +#pragma once + +// @generated by torchgen/gen.py from Operators.h + +#ifdef TORCH_ASSERT_NO_OPERATORS +#error This change adds a dependency on native_functions.yaml, \ + meaning the file will need to be re-compiled every time an operator \ + is changed or added. Consider if your change would be better placed in \ + another file, or if a more specific header might achieve the same goal. \ + See NOTE: [Tensor vs. TensorBase] +#endif + +#if defined(AT_PER_OPERATOR_HEADERS) && defined(TORCH_ASSERT_ONLY_METHOD_OPERATORS) +#error This change adds a dependency on all pytorch operators, meaning the \ + file will need to be re-compiled every time an operator is changed or added. \ + Consider including a specific operator from \ + and see NOTE [TORCH_ASSERT_ONLY_METHOD_OPERATORS]. +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Extension writers: do you write wrapper functions? Are you frustrated with +// resolving overloads of operators? Are you frustrated with dealing with +// pointer-to-methods and resolving overloads of pointer-to-methods?? Look no +// further, this is the utility for you. +// +// Given an operator schema: aten::op.overload(... +// +// Use ATEN_FN2(op, overload) to get a *function* version of the operator +// that is guaranteed to not be overloaded. This means that you can safely +// decltype(&ATEN_FN2(op, overload)) it. NB: the 2 means this macro takes 2 args. +// +// Given an operator schema without an overload name: aten::op(... +// +// Use ATEN_FN(op) to get an unambiguous *function* version of the operator. +// +// There is some interesting behavior for out= operations. +// ATEN_FN2(sin, out) gives a function that is *faithful* to the schema; +// that is, the order of arguments is exactly what it looks like in the schema. + +#define ATEN_FN2(op_name, overload) at::_ops::op_name##_##overload::call +#define ATEN_FN(op_name) at::_ops::op_name::call + +// Separately, ATEN_OP(op) and ATEN_OP2(op, overload) define a class containing compile-time +// metadata about a given aten operator. +// Notable data on the class includes: +// - ATEN_OP2(add, Tensor)::name // returns the string name: "add" +// - ATEN_OP2(add, Tensor)::overload_name // returns the string overload name: "Tensor" +// - ATEN_OP2(add, Tensor)::schema // returns the C++ schema type: at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &) +// - ATEN_OP2(add, Tensor)::schema_str // returns the string jit type: "add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) -> Tensor" + +#define ATEN_OP2(op_name, overload) at::_ops::op_name##_##overload +#define ATEN_OP(op_name) at::_ops::op_name + +// WARNING: Please do not call any of the ops in the _ops namespace directly. +// Use the ATEN_FN macros. We do not guarantee stability of the naming +// scheme for the functions in at::_ops + +// See Note [The ATen Operators API] for details of the at::_ops namespace + +namespace at { +namespace _ops { + +} // namespace _ops +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PTThreadPool.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PTThreadPool.h new file mode 100644 index 0000000000000000000000000000000000000000..1c910dfb97dce44748c054b457b400b13b1b9fda --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PTThreadPool.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +namespace at { + +class TORCH_API PTThreadPool : public c10::ThreadPool { + public: + explicit PTThreadPool(int pool_size, int numa_node_id = -1) + : c10::ThreadPool(pool_size, numa_node_id, []() { + c10::setThreadName("PTThreadPool"); + at::init_num_threads(); + }) {} +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNativeTBB.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNativeTBB.h new file mode 100644 index 0000000000000000000000000000000000000000..9193e06ed695233637fe5ee8344777e3e42c799b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ParallelNativeTBB.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include + +#include + +#ifdef _WIN32 +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#endif +#include + +#define INTRA_OP_PARALLEL + +namespace at::internal { + +template +inline void invoke_parallel( + const int64_t begin, + const int64_t end, + const int64_t grain_size, + const F& f) { + // Choose number of tasks based on grain size and number of threads. + int64_t chunk_size = divup((end - begin), get_num_threads()); + // Make sure each task is at least grain_size size. + chunk_size = std::max(grain_size, chunk_size); + + std::atomic_flag err_flag = ATOMIC_FLAG_INIT; + std::exception_ptr eptr; + tbb::parallel_for( + tbb::blocked_range(begin, end, chunk_size), + [&eptr, &err_flag, f](const tbb::blocked_range& r) { + try { + internal::ThreadIdGuard tid_guard( + tbb::this_task_arena::current_thread_index()); + f(r.begin(), r.end()); + } catch (...) { + if (!err_flag.test_and_set()) { + eptr = std::current_exception(); + } + } + }, + tbb::static_partitioner{}); + if (eptr) { + std::rethrow_exception(eptr); + } +} + +} // namespace at::internal diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PythonTorchFunctionTLS.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PythonTorchFunctionTLS.h new file mode 100644 index 0000000000000000000000000000000000000000..a8efa8eab357b55eb00d997ffe8458b841b0da36 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/PythonTorchFunctionTLS.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include + +namespace at::impl { + +enum TorchFunctionDisabledState { ENABLED, SUBCLASSES_DISABLED, ALL_DISABLED }; + +struct TORCH_API PythonTorchFunctionTLS { + static void set_disabled_state(TorchFunctionDisabledState disabled_state_); + static TorchFunctionDisabledState get_disabled_state(); + + static void push_onto_stack(std::shared_ptr mode); + static const std::shared_ptr pop_stack(); + static const std::shared_ptr& get_stack_at(int64_t idx); + static int64_t stack_len(); + + static const PythonTorchFunctionTLS& get_state(); + static void set_state(const PythonTorchFunctionTLS& state); + + private: + // The mode TLS is split into + // - disabled_state, which says which part of torch function are disabled + // - stack_, which is a vector of modes representing the stack of user + // defined modes + TorchFunctionDisabledState disabled_state_ = + TorchFunctionDisabledState::ENABLED; + std::vector> stack_; +}; + +TORCH_API bool torch_function_mode_enabled(); + +} // namespace at::impl diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SequenceNumber.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SequenceNumber.h new file mode 100644 index 0000000000000000000000000000000000000000..41b7b97cf6abbdcf987c020e14b09a64f7729bfc --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SequenceNumber.h @@ -0,0 +1,13 @@ +#pragma once + +#include +#include + +// A simple thread local enumeration, used to link forward and backward pass +// ops and is used by autograd and observers framework +namespace at::sequence_number { + +TORCH_API uint64_t peek(); +TORCH_API uint64_t get_and_increment(); + +} // namespace at::sequence_number diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseTensorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseTensorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..dfb84c02bb34cbc9313d5efaac57069de17d77c4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/SparseTensorImpl.h @@ -0,0 +1,400 @@ +#pragma once + +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#include +#endif + +namespace at { +struct TORCH_API SparseTensorImpl : public TensorImpl { + // Stored in COO format, indices + values. + + // INVARIANTS: + // sparse_dim: range [0, len(shape)]; sparse_dim + dense_dim = len(shape) + // dense_dim : range [0, len(shape)]; sparse_dim + dense_dim = len(shape) + // _indices.shape: dimensionality: 2, shape: (sparse_dim, nnz) + // _values.shape: dimensionality: 1 + dense_dim. shape: (nnz, + // shape[sparse_dim:]) + + int64_t sparse_dim_ = 0; // number of sparse dimensions + int64_t dense_dim_ = 0; // number of dense dimensions + + Tensor indices_; // always a LongTensor + Tensor values_; + + // A sparse tensor is 'coalesced' if every index occurs at most once in + // the indices tensor, and the indices are in sorted order. (This means + // that it is very easy to convert a coalesced tensor to CSR format: you + // need only compute CSR format indices.) + // + // Most math operations can only be performed on coalesced sparse tensors, + // because many algorithms proceed by merging two sorted lists (of indices). + bool coalesced_ = false; + + // compute_numel with integer multiplication overflow check, see gh-57542 + void refresh_numel() { + TensorImpl::safe_refresh_numel(); + } + + public: + // Public for now... + explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta); + + void release_resources() override; + + int64_t nnz() const { + return values_.size(0); + } + + c10::SymInt sym_nnz() const { + return values_.sym_size(0); + } + int64_t sparse_dim() const { + return sparse_dim_; + } + int64_t dense_dim() const { + return dense_dim_; + } + bool coalesced() const { + return coalesced_; + } + Tensor indices() const { + return indices_; + } + Tensor values() const { + return values_; + } + + void set_size(int64_t dim, int64_t new_size) override; + void set_stride(int64_t dim, int64_t new_stride) override; + void set_storage_offset(int64_t storage_offset) override; + +#ifdef DEBUG + bool has_storage() const override; +#endif + + // WARNING: This function does NOT preserve invariants of sparse_dim/dense_dim + // with respect to indices and values + void raw_resize_(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "raw_resize_ ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "raw_resize_ called on tensor with symbolic shape") + set_sizes_and_strides(size, std::vector(size.size())); + sparse_dim_ = sparse_dim; + dense_dim_ = dense_dim; + refresh_numel(); + } + + // NOTE: This function preserves invariants of sparse_dim/dense_dim with + // respect to indices and values. + // + // NOTE: This function supports the following cases: + // 1. When we keep the number of dense dimensions unchanged, and NOT shrinking + // the size of any of the dense dimensions. + // 2. When we keep the number of sparse dimensions unchanged, and NOT + // shrinking the size of any of the sparse dimensions. + // 3. When the sparse tensor has zero nnz, in which case we are free to change + // the shapes of both its sparse and dense dimensions. + // + // This function DOESN'T support (and will throw an error) the following + // cases: + // 1. When we attempt to change the number of sparse dimensions on a non-empty + // sparse tensor (such an operation will invalidate the indices stored). + // 2. When we attempt to change the number of dense dimensions on a non-empty + // sparse tensor (such an operation will behave differently from an equivalent + // dense tensor's resize method, and for API consistency we don't support it). + // 3. When we attempt to shrink the size of any of the dense dimensions on a + // non-empty sparse tensor (such an operation will behave differently from an + // equivalent dense tensor's resize method, and for API consistency we don't + // support it). + // 4. When we attempt to shrink the size of any of the sparse dimensions on a + // non-empty sparse tensor (this could make some of the stored indices + // out-of-bound and thus unsafe). + template + void _resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "resize_ ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "resize_ called on tensor with symbolic shape") + TORCH_CHECK( + sparse_dim + dense_dim == static_cast(size.size()), + "number of dimensions must be sparse_dim (", + sparse_dim, + ") + dense_dim (", + dense_dim, + "), but got ", + size.size()); + if (nnz() > 0) { + auto alt_options_msg = + "You could try the following options:\n\ +1. If you need an empty sparse tensor of this size, call `x = torch.sparse_coo_tensor(size)`.\n\ +2. If you need to resize this tensor, you have the following options:\n\ + 1. For both sparse and dense dimensions, keep the number of them constant and the size of them non-shrinking, and then try the same call again.\n\ + 2. Or, create a new sparse tensor with the correct indices and values from this sparse tensor."; + + TORCH_CHECK( + sparse_dim == sparse_dim_, + "changing the number of sparse dimensions (from ", + sparse_dim_, + " to ", + sparse_dim, + ") on a non-empty sparse tensor is not supported.\n", + alt_options_msg); + + TORCH_CHECK( + dense_dim == dense_dim_, + "changing the number of dense dimensions (from ", + dense_dim_, + " to ", + dense_dim, + ") on a non-empty sparse tensor is not supported.\n", + alt_options_msg); + + bool shrinking_sparse_dims = false; + bool shrinking_dense_dim = false; + auto sparse_size_original = generic_sizes().slice(0, sparse_dim); + auto sparse_size_new = size.slice(0, sparse_dim); + for (const auto i : c10::irange(sparse_dim)) { + if (sparse_size_new[i] < sparse_size_original[i]) { + shrinking_sparse_dims = true; + break; + } + } + auto dense_size_original = generic_sizes().slice(sparse_dim); + auto dense_size_new = size.slice(sparse_dim); + for (const auto i : c10::irange(dense_dim)) { + if (dense_size_new[i] < dense_size_original[i]) { + shrinking_dense_dim = true; + break; + } + } + + TORCH_CHECK( + !shrinking_sparse_dims, + "shrinking the size of sparse dimensions (from ", + sparse_size_original, + " to ", + sparse_size_new, + ") on a non-empty sparse tensor is not supported.\n", + alt_options_msg); + + TORCH_CHECK( + !shrinking_dense_dim, + "shrinking the size of dense dimensions (from ", + dense_size_original, + " to ", + dense_size_new, + ") on a non-empty sparse tensor is not supported.\n", + alt_options_msg); + } + + auto sizes_and_strides = generic_sizes(); + const bool size_equals_sizes = std::equal( + size.begin(), + size.end(), + sizes_and_strides.begin(), + sizes_and_strides.end()); + if ((!size_equals_sizes) || (sparse_dim != sparse_dim_) || + (dense_dim != dense_dim_)) { + auto nnz = at::symint::sizes(values())[0]; + std::vector values_size = {nnz}; + auto dense_size = size.slice(sparse_dim); + values_size.insert( + values_size.end(), dense_size.begin(), dense_size.end()); + at::symint::resize_(values_, values_size); + at::symint::resize_(indices_, {T(sparse_dim), nnz}); + } + + if (!size_equals_sizes) { + set_sizes_and_strides(size, std::vector(size.size())); + } + sparse_dim_ = sparse_dim; + dense_dim_ = dense_dim; + refresh_numel(); + } + + void resize_(int64_t sparse_dim, int64_t dense_dim, ArrayRef size) { + return _resize_(sparse_dim, dense_dim, size); + } + + void resize_( + int64_t sparse_dim, + int64_t dense_dim, + ArrayRef size) { + return _resize_(sparse_dim, dense_dim, size); + } + + // NOTE: this function will resize the sparse tensor and also set `indices` + // and `values` to empty. + void resize_and_clear_( + int64_t sparse_dim, + int64_t dense_dim, + IntArrayRef size) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "resize_and_clear_ ", + err_msg_tensor_metadata_change_not_allowed); + TORCH_CHECK( + !has_symbolic_sizes_strides_, + "resize_and_clear_ called on tensor with symbolic shape") + TORCH_CHECK( + sparse_dim + dense_dim == static_cast(size.size()), + "number of dimensions must be sparse_dim (", + sparse_dim, + ") + dense_dim (", + dense_dim, + "), but got ", + size.size()); + + set_sizes_and_strides(size, std::vector(size.size())); + sparse_dim_ = sparse_dim; + dense_dim_ = dense_dim; + + auto empty_indices = at::empty({sparse_dim, 0}, indices().options()); + std::vector values_size = {0}; + auto dense_size = sizes().slice(sparse_dim); + values_size.insert(values_size.end(), dense_size.begin(), dense_size.end()); + auto empty_values = at::empty(values_size, values().options()); + set_indices_and_values_unsafe(empty_indices, empty_values); + refresh_numel(); + } + + void set_coalesced(bool coalesced) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_coalesced ", + err_msg_tensor_metadata_change_not_allowed); + coalesced_ = coalesced; + } + + // NOTE: this function is only used internally and not exposed to Python + // frontend + void set_nnz_and_narrow(int64_t new_nnz) { + TORCH_CHECK( + allow_tensor_metadata_change(), + "set_nnz_and_narrow ", + err_msg_tensor_metadata_change_not_allowed); + AT_ASSERT(new_nnz <= nnz()); + indices_ = indices_.narrow(1, 0, new_nnz); + values_ = values_.narrow(0, 0, new_nnz); + if (new_nnz < 2) { + coalesced_ = true; + } + } + + // Takes indices and values and directly puts them into the sparse tensor, no + // copy. NOTE: this function is unsafe because it doesn't check whether any + // indices are out of boundaries of `sizes`, so it should ONLY be used where + // we know that the indices are guaranteed to be within bounds. This used to + // be called THSTensor_(_move) NB: This used to be able to avoid a refcount + // bump, but I was too lazy to make it happen + void set_indices_and_values_unsafe( + const Tensor& indices, + const Tensor& values); + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive(key_set(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + + /** + * Return a TensorImpl that is a shallow-copy of this TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, + * see NOTE [ TensorImpl Shallow-Copying ]. + */ + c10::intrusive_ptr shallow_copy_and_detach( + c10::VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const override { + auto impl = c10::make_intrusive(key_set(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/std::move(version_counter), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + + /** + * Shallow-copies data from another TensorImpl into this TensorImpl. + * + * For why this function doesn't check this TensorImpl's + * `allow_tensor_metadata_change_`, see NOTE [ TensorImpl Shallow-Copying ]. + */ + void shallow_copy_from(const c10::intrusive_ptr& impl) override { + AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set())); + auto sparse_impl = static_cast(impl.get()); + copy_tensor_metadata( + /*src_sparse_impl=*/sparse_impl, + /*dest_sparse_impl=*/this, + /*version_counter=*/version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); + refresh_numel(); + } + + private: + explicit SparseTensorImpl( + at::DispatchKeySet, + const caffe2::TypeMeta, + at::Tensor indices, + at::Tensor values); + + /** + * Copy the tensor metadata fields (e.g. sizes / strides / storage pointer / + * storage_offset) from one TensorImpl to another TensorImpl. + * + * For usage of `version_counter` and `allow_tensor_metadata_change`, see NOTE + * [ TensorImpl Shallow-Copying ]. + */ + static void copy_tensor_metadata( + const SparseTensorImpl* src_sparse_impl, + SparseTensorImpl* dest_sparse_impl, + c10::VariableVersion version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata( + src_sparse_impl, + dest_sparse_impl, + std::move(version_counter), + allow_tensor_metadata_change); + + // Sparse-specific fields + dest_sparse_impl->sparse_dim_ = src_sparse_impl->sparse_dim(); + dest_sparse_impl->dense_dim_ = src_sparse_impl->dense_dim(); + dest_sparse_impl->indices_ = src_sparse_impl->indices(); + dest_sparse_impl->values_ = src_sparse_impl->values(); + dest_sparse_impl->coalesced_ = src_sparse_impl->coalesced(); + } + + const char* tensorimpl_type_name() const override; +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/StorageUtils.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/StorageUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..f7a9fdab0cc732378cb6d64bfcbc219dafdd4ec3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/StorageUtils.h @@ -0,0 +1,49 @@ +#pragma once + +#include +#include +#include + +namespace at { + +class TensorBase; + +// Here we define a series of utils to create/manipulate ATen backed +// c10 storage implementations. + +/** + * Create a new shared memory storage impl managed by file descriptor + * + * @param size size in bytes + */ +C10_EXPORT c10::intrusive_ptr new_shm_fd_storage(size_t size); + +/** + * Copy src to dst + * Caller must guarantee the validness of the storage objects + * during the entire copy process, esp. when it's async. + * + * This can probably live in c10 namespace later if needed, + * but for now keep it in at to keep implementation simple. + * + * @param dst dst tensor + * @param src src tensor + * @param non_blocking (default false) whether this operation blocks caller + */ +C10_EXPORT void storage_copy( + c10::Storage& dst, + const c10::Storage& src, + bool non_blocking = false); + +/** + * In place change the storage to shm based. + * + * This is only applicable to CPU tensors not already shared. + * Otherwise, it's a no op to mirror the THP tensor behavior: + * https://pytorch.org/docs/stable/generated/torch.Tensor.share_memory_.html + * + * @param t a tensor + */ +C10_EXPORT void share_memory_(TensorBase& t); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIndexing.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIndexing.h new file mode 100644 index 0000000000000000000000000000000000000000..b6bb4710900c5570496785c502e492ec76520d93 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TensorIndexing.h @@ -0,0 +1,735 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +#include + +#include + +namespace at::indexing { + +constexpr int64_t INDEX_MIN = c10::SymInt::min_representable_int(); +constexpr int64_t INDEX_MAX = -(INDEX_MIN + 1); + +enum class TensorIndexType { None, Ellipsis, SymInt, Boolean, Slice, Tensor }; + +constexpr c10::nullopt_t None = c10::nullopt; + +struct TORCH_API EllipsisIndexType final { + EllipsisIndexType() = default; +}; +TORCH_API extern const EllipsisIndexType Ellipsis; + +struct TORCH_API Slice final { + public: + Slice( + c10::optional start_index = c10::nullopt, + c10::optional stop_index = c10::nullopt, + c10::optional step_index = c10::nullopt) { + if (!step_index.has_value()) { + step_ = c10::SymInt(1); + } else { + step_ = std::move(step_index).value(); + } + + TORCH_CHECK_VALUE(step_ != 0, "slice step cannot be zero"); + + if (!start_index.has_value()) { + start_ = c10::SymInt(step_ < 0 ? INDEX_MAX : 0); + } else { + start_ = std::move(start_index).value(); + } + + if (!stop_index.has_value()) { + stop_ = c10::SymInt(step_ < 0 ? INDEX_MIN : INDEX_MAX); + } else { + stop_ = std::move(stop_index).value(); + } + } + + inline c10::SymInt start() const { + return start_; + } + + inline c10::SymInt stop() const { + return stop_; + } + + inline c10::SymInt step() const { + return step_; + } + + private: + c10::SymInt start_; + c10::SymInt stop_; + c10::SymInt step_; +}; + +TORCH_API std::ostream& operator<<(std::ostream& stream, const Slice& slice); + +// `at::indexing::TensorIndex` is used for converting C++ tensor indices such as +// `{None, "...", Ellipsis, 0, true, Slice(1, None, 2), torch::tensor({1, 2})}` +// into its equivalent `std::vector`, so that further tensor +// indexing operations can be performed using the supplied indices. +// +// There is one-to-one correspondence between Python and C++ tensor index types: +// Python | C++ +// ----------------------------------------------------- +// `None` | `at::indexing::None` +// `Ellipsis` | `at::indexing::Ellipsis` +// `...` | `"..."` +// `123` | `123` +// `True` / `False` | `true` / `false` +// `:` | `Slice()` / `Slice(None, None)` +// `::` | `Slice()` / `Slice(None, None, None)` +// `1:` | `Slice(1, None)` +// `1::` | `Slice(1, None, None)` +// `:3` | `Slice(None, 3)` +// `:3:` | `Slice(None, 3, None)` +// `::2` | `Slice(None, None, 2)` +// `1:3` | `Slice(1, 3)` +// `1::2` | `Slice(1, None, 2)` +// `:3:2` | `Slice(None, 3, 2)` +// `1:3:2` | `Slice(1, 3, 2)` +// `torch.tensor([1, 2])`) | `torch::tensor({1, 2})` +struct TORCH_API TensorIndex final { + // Case 1: `at::indexing::None` + TensorIndex(c10::nullopt_t) : type_(TensorIndexType::None) {} + + // Case 2: "..." / `at::indexing::Ellipsis` + TensorIndex(at::indexing::EllipsisIndexType) + : type_(TensorIndexType::Ellipsis) {} + TensorIndex(const char* str) : TensorIndex(at::indexing::Ellipsis) { + TORCH_CHECK_VALUE( + strcmp(str, "...") == 0, + "Expected \"...\" to represent an ellipsis index, but got \"", + str, + "\""); + } + + // Case 3: (Sym) Integer value + TensorIndex(SymInt integer) + : integer_(std::move(integer)), type_(TensorIndexType::SymInt) {} + TensorIndex(int64_t integer) : TensorIndex(SymInt(integer)) {} + TensorIndex(int integer) : TensorIndex(SymInt(integer)) {} + + // Case 4: Boolean value + template >> + TensorIndex(T boolean) : boolean_(boolean), type_(TensorIndexType::Boolean) {} + + // Case 5: Slice represented in `at::indexing::Slice` form + TensorIndex(Slice slice) + : slice_(std::move(slice)), type_(TensorIndexType::Slice) {} + + // Case 6: Tensor value + TensorIndex(Tensor tensor) + : tensor_(std::move(tensor)), type_(TensorIndexType::Tensor) {} + + inline bool is_none() const { + return type_ == TensorIndexType::None; + } + + inline bool is_ellipsis() const { + return type_ == TensorIndexType::Ellipsis; + } + + inline bool is_integer() const { + return type_ == TensorIndexType::SymInt; + } + + inline SymInt integer() const { + return integer_; + } + + inline bool is_boolean() const { + return type_ == TensorIndexType::Boolean; + } + + inline bool boolean() const { + return boolean_; + } + + inline bool is_slice() const { + return type_ == TensorIndexType::Slice; + } + + inline const Slice& slice() const { + return slice_; + } + + inline bool is_tensor() const { + return type_ == TensorIndexType::Tensor; + } + + inline const Tensor& tensor() const { + return tensor_; + } + + private: + SymInt integer_ = 0; + bool boolean_ = false; + Slice slice_; + Tensor tensor_; + TensorIndexType type_; +}; + +TORCH_API std::ostream& operator<<( + std::ostream& stream, + const TensorIndex& tensor_index); +TORCH_API std::ostream& operator<<( + std::ostream& stream, + const std::vector& tensor_indices); + +namespace impl { +static inline Tensor applySlice( + const Tensor& self, + int64_t dim, + c10::SymInt start, + c10::SymInt stop, + c10::SymInt step, + bool disable_slice_optimization, + const at::Device& self_device, + const c10::optional& self_sizes) { + // TODO: implement negative step + TORCH_CHECK_VALUE(step > 0, "step must be greater than zero"); + + // See NOTE [nested tensor size for indexing] + if (self_sizes.has_value()) { + // Skip this optimization if we are tracing, as the trace may be polymorphic + // over the shape of the `self` tensor, and we still want to record + // the slice. + SymInt length = (self_device == at::kCPU || self_device == at::kCUDA) + ? (*self_sizes)[dim] + : self.sym_size(dim); + if (!disable_slice_optimization && + TORCH_GUARD_SIZE_OBLIVIOUS(start.sym_eq(0)) && length == stop && + step == 1) { + return self; + } + } + return self.slice_symint( + dim, std::move(start), std::move(stop), std::move(step)); +} + +static inline Tensor applySelect( + const Tensor& self, + int64_t dim, + SymInt index, + int64_t real_dim, + const at::Device& /*self_device*/, + const c10::optional& self_sizes) { + // See NOTE [nested tensor size for indexing] + if (self_sizes.has_value()) { + auto maybe_index = index.maybe_as_int(); + if (maybe_index.has_value()) { + TORCH_CHECK_INDEX( + !(maybe_index.value() == 0 && dim == 0 && self_sizes->empty()), + "invalid index of a 0-dim tensor. ", + "Use `tensor.item()` in Python or `tensor.item()` in C++ to convert a 0-dim tensor to a number"); + } + + auto size = (*self_sizes)[dim]; + // Note: `size >= -index` is not equivalent to `size > -1 - index` if index + // is INT64_MIN For std::numeric_limits::min() result of unary + // minus is undefined by the standard but in practice is equal to self. On + // the other hand, indexing wraping is valid for all negative int64_t + // values, as x[INT64_MIN] is the same as x[INT64_MAX] + TORCH_CHECK_INDEX( + size > -1 - index && size > index, + "index ", + index, + " is out of bounds for dimension ", + real_dim, + " with size ", + size); + } + + // if the index is negative, do not normalize it because that would fix the + // index on the current tensor size in the tracer. aten::select also works on + // negative indices + return self.select_symint(dim, std::move(index)); +} + +static inline Tensor boolToIndexingTensorCPUOrCUDA( + const Tensor& self, + bool value) { + // booleans add a dimension of size 1. true indexes this dimension as if 0:, + // false as empty. + if (value) { + return at::empty({1}, self.options().dtype(kLong)).fill_(0.); + } else { + return at::empty({0}, self.options().dtype(kLong)); + } +} + +static inline Tensor boolToIndexingTensorNonNativeDeviceType( + const Tensor& self, + bool value) { + // booleans add a dimension of size 1. true indexes this dimension as if 0:, + // false as empty. + if (value) { + return at::zeros({1}, self.options().dtype(kLong)); + } else { + return at::empty({0}, self.options().dtype(kLong)); + } +} + +static inline Tensor boolToIndexingTensor( + const Tensor& self, + bool value, + const at::Device& self_device) { + if (self_device == at::kCPU || self_device == at::kCUDA) { + return boolToIndexingTensorCPUOrCUDA(self, value); + } else { + return boolToIndexingTensorNonNativeDeviceType(self, value); + } +} + +static inline Tensor scalarToTensorNonNativeDeviceType( + const Scalar& v, + const TensorOptions& options) { + return at::scalar_tensor(v, options); +} + +static inline void recordTensorIndex( + const Tensor& tensor, + std::vector& outIndices, + int64_t* dim_ptr) { + // TODO: check scalarType + outIndices.resize(*dim_ptr + 1); + outIndices[*dim_ptr] = tensor; + (*dim_ptr)++; +}; + +static inline c10::List> typeConvertIndices( + const Tensor& /*self*/, + std::vector&& indices) { + c10::List> converted_inds; + converted_inds.reserve(indices.size()); + for (auto&& i : std::move(indices)) { + converted_inds.push_back(std::move(i)); + } + return converted_inds; +} + +// NOTE: Why do we mirror instead of replace the `count_specified_dimensions` +// function in torch/csrc/autograd/python_variable_indexing.cpp? It's because +// `count_specified_dimensions` is on the hot path of Python tensor multi-dim +// indexing (i.e. it's called by `applySlicing` which is called by +// `THPVariable_getitem` / `THPVariable_setitem` when handling indexing of more +// than one dimension). If we were to merge the Python/C++ +// `count_specified_dimensions` function, on the Python side we would have to +// construct a `std::vector` container to be consumed by the C++ +// `count_specified_dimensions` function, which adds 100s of nanoseconds +// overhead and is undesirable. +static inline int64_t count_specified_dimensions( + const ArrayRef& indices) { + // Count the number of indexed dimensions (everything but ellipsis and None) + int64_t count = 0; + for (auto& obj : indices) { + if (obj.is_tensor()) { + auto& tensor = obj.tensor(); + if (tensor.scalar_type() == kByte || tensor.scalar_type() == kBool) { + count += tensor.dim(); + } else { + count++; + } + } else if (!obj.is_none() && !obj.is_ellipsis() && !obj.is_boolean()) { + count++; + } + } + return count; +} +} // namespace impl + +// NOTE: Many functions below are only for consumption from Python indexing +// implementation, they include: +// +// - `Tensor scalarToTensor(...)` +// - `IntArrayRef slicePrefix1sSize(...)` +// - `void copy_to(...)` +// - `Tensor handleDimInMultiDimIndexing(...)` +// - `Tensor dispatch_index(...)` +// - `Tensor dispatch_index_put_(...)` +// - `Tensor get_item(...)` +// - `void set_item(...)` +// +// The rest of the functions are in `at::indexing::impl` namespace, signifying +// that they shouldn't be used from Python indexing implementation. +static inline Tensor scalarToTensor( + const Scalar& v, + const TensorOptions& options, + const at::Device& self_device) { + if (self_device == at::kCPU && !v.isSymbolic()) { + return at::detail::scalar_tensor_static( + v, options.dtype_opt()->toScalarType(), self_device); + } else { + return impl::scalarToTensorNonNativeDeviceType(v, options); + } +} + +// To match numpy semantics: +// As a special case for backwards compatibility, +// strip away unit dimensions from the left of 'src' +static inline SymIntArrayRef slicePrefix1sSize(const SymIntArrayRef& sizes) { + size_t first_non1_src = sizes.size(); + for (const auto i : c10::irange(sizes.size())) { + // Unbacked SymInt has different behavior, but this is sound because + // failing to slice will only ever cause an error, not divergent + // behavior + if (!sizes[i].has_hint() || sizes[i] != 1) { + first_non1_src = i; + break; + } + } + + return sizes.slice(first_non1_src); +} + +static inline void copy_to(const Tensor& dst, const Tensor& src) { + if (dst.sym_sizes().equals(src.sym_sizes())) { + // A shortcut to avoid generating hard-coded constant sizes during tracing. + // This is not a perfect solution: when src & dst have different shapes, + // constants will still appear. Users can workaround that case by + // dst[index..] = src.reshape(..) + dst.copy_(src); + return; + } else if (src.dim() == 0 && src.device().type() == at::kCPU) { + dst.fill_(src); + return; + } + auto src_view = src.view_symint(slicePrefix1sSize(src.sym_sizes())); + c10::MaybeOwned b_src = expand_inplace(dst, src_view, "setitem"); + dst.copy_(*b_src); +} + +// See NOTE [ Setting `disable_slice_optimization` when calling C++ tensor +// indexing functions from Python ] +static inline Tensor handleDimInMultiDimIndexing( + const Tensor& prev_dim_result, + const Tensor& original_tensor, + const TensorIndex& index, + int64_t* dim_ptr, + int64_t* specified_dims_ptr, + int64_t real_dim, + std::vector& outIndices, + bool disable_slice_optimization, + const at::Device& original_tensor_device, + const c10::optional& prev_dim_result_sizes) { + if (index.is_integer()) { + return impl::applySelect( + prev_dim_result, + *dim_ptr, + index.integer(), + real_dim, + original_tensor_device, + prev_dim_result_sizes); + } else if (index.is_slice()) { + Tensor result = impl::applySlice( + prev_dim_result, + *dim_ptr, + index.slice().start(), + index.slice().stop(), + index.slice().step(), + /*disable_slice_optimization=*/disable_slice_optimization, + original_tensor_device, + prev_dim_result_sizes); + (*dim_ptr)++; + return result; + } else if (index.is_ellipsis()) { + (*dim_ptr) += original_tensor.dim() - (*specified_dims_ptr); + return prev_dim_result; + } else if (index.is_none()) { + Tensor result = prev_dim_result.unsqueeze(*dim_ptr); + (*dim_ptr)++; + return result; + } else if (index.is_boolean()) { + Tensor result = prev_dim_result.unsqueeze(*dim_ptr); + impl::recordTensorIndex( + impl::boolToIndexingTensor( + result, index.boolean(), original_tensor_device), + outIndices, + dim_ptr); + return result; + } else if (index.is_tensor()) { + Tensor result = prev_dim_result; + const Tensor& tensor = index.tensor(); + auto scalar_type = tensor.scalar_type(); + if (tensor.dim() == 0 && + at::isIntegralType(scalar_type, /*includeBool=*/true)) { + if (scalar_type != at::kByte && scalar_type != at::kBool) { + result = impl::applySelect( + result, + *dim_ptr, + tensor.item(), + real_dim, + original_tensor_device, + prev_dim_result_sizes); + } else { + result = result.unsqueeze(*dim_ptr); + if (scalar_type == at::kBool) { + impl::recordTensorIndex( + impl::boolToIndexingTensor( + result, tensor.item() != 0, original_tensor_device), + outIndices, + dim_ptr); + } else { + impl::recordTensorIndex( + impl::boolToIndexingTensor( + result, tensor.item() != 0, original_tensor_device), + outIndices, + dim_ptr); + } + } + } else { + impl::recordTensorIndex(tensor, outIndices, dim_ptr); + } + return result; + } else { + TORCH_INTERNAL_ASSERT(false, "Invalid TensorIndex type"); + } +} + +namespace impl { +// This mirrors `applySlicing` in +// torch/csrc/autograd/python_variable_indexing.cpp +static inline Tensor applySlicing( + const Tensor& self, + const ArrayRef& indices, + std::vector& outIndices, + bool disable_slice_optimization, + const at::Device& self_device, + const c10::optional& self_sizes) { + int64_t dim = 0; + int64_t specified_dims = impl::count_specified_dimensions(indices); + + // See NOTE [nested tensor size for indexing] + if (self_sizes.has_value()) { + TORCH_CHECK_INDEX( + specified_dims <= (int64_t)self_sizes->size(), + "too many indices for tensor of dimension ", + (int)self_sizes->size()); + } + + Tensor result = self; + for (const auto i : c10::irange(indices.size())) { + auto& obj = indices[i]; + // See NOTE [nested tensor size for indexing] + c10::optional result_sizes = result.is_nested() + ? c10::optional(c10::nullopt) + : c10::optional(result.sym_sizes()); + result = handleDimInMultiDimIndexing( + /*prev_dim_result=*/result, + /*original_tensor=*/self, + /*index=*/obj, + /*dim_ptr=*/&dim, + /*specified_dims_ptr=*/&specified_dims, + /*real_dim=*/static_cast(i), + /*outIndices=*/outIndices, + /*disable_slice_optimization=*/disable_slice_optimization, + /*original_tensor_device=*/self_device, + /*prev_dim_result_sizes=*/result_sizes); + } + return result; +} +} // namespace impl + +static inline Tensor dispatch_index( + const Tensor& self, + std::vector&& indices) { + return self.index(impl::typeConvertIndices(self, std::move(indices))); +} + +static inline Tensor dispatch_index_put_( + Tensor& self, + std::vector&& indices, + const Tensor& value) { + return self.index_put_( + impl::typeConvertIndices(self, std::move(indices)), value); +} + +// NOTE [ Setting `disable_slice_optimization` when calling C++ tensor indexing +// functions from Python ] +// +// Question: When should we set `disable_slice_optimization` to `true` when +// calling C++ tensor indexing functions from Python indexing code? +// +// Answer: What "slice optimization" means: when we have a slicing expression +// like `x[0:5, 0]`, where the sliced tensor was of size 5 in dimension 0, we +// would skip dispatching the actual slice call as an optimization. However, +// here are the cases where we DON'T want this optimization: +// +// 1. When we are doing 1-D slicing (e.g. `tensor[:]`). +// Reason: we always return a shallow copy for expressions such as +// `tensor[:]` / `tensor[...]` / `tensor[:, :]`. (Note that for `tensor[:, +// :]`, we return an alias of `tensor` by doing the following: +// ``` +// Tensor sliced = impl::applySlicing(self, indices, tensorIndices, +// disable_slice_optimization, self_device, self_sizes); if +// (tensorIndices.empty()) { +// if (sliced.is_same(self)) { +// // ensure we return a shallow copy for things like x[...] +// sliced = at::alias(sliced); +// } +// return sliced; +// } +// ```) +// 2. When we are doing JIT tracing. +// Reason: JIT tracing needs the `self.slice(...)` call to properly trace the +// slice operation. + +// This mirrors `THPVariable_getitem` in +// torch/csrc/autograd/python_variable_indexing.cpp See NOTE [ Setting +// `disable_slice_optimization` when calling C++ tensor indexing functions from +// Python ] +static inline Tensor get_item( + const Tensor& self, + const ArrayRef& indices, + bool disable_slice_optimization = false) { + at::Device self_device = self.device(); + // NOTE [nested tensor size for indexing] + // nested tensor does not have a size (yet) so for now we represent its size + // as null may need to be changed after we reach a better solution for nested + // tensor size + c10::optional self_sizes = self.is_nested() + ? c10::optional(c10::nullopt) + : c10::optional(self.sym_sizes()); + + // handle simple types: integers, slices, none, ellipsis, bool + if (indices.size() == 1) { + const TensorIndex& index = indices[0]; + if (index.is_integer()) { + return impl::applySelect( + self, 0, index.integer(), 0, self_device, self_sizes); + } else if (index.is_slice()) { + return impl::applySlice( + self, + 0, + index.slice().start(), + index.slice().stop(), + index.slice().step(), + /*disable_slice_optimization=*/true, + self_device, + self_sizes); + } else if (index.is_none()) { + return self.unsqueeze(0); + } else if (index.is_ellipsis()) { + return at::alias(self); + } else if (index.is_boolean()) { + Tensor result = self.unsqueeze(0); + return dispatch_index( + result, + std::vector{impl::boolToIndexingTensor( + result, index.boolean(), self_device)}); + } + } + + std::vector tensorIndices; + Tensor sliced = impl::applySlicing( + self, + indices, + tensorIndices, + disable_slice_optimization, + self_device, + self_sizes); + if (tensorIndices.empty()) { + if (sliced.is_same(self)) { + // ensure we return a shallow copy for things like x[...] + sliced = at::alias(sliced); + } + return sliced; + } + + // indexing by tensors ("advanced" indexing) + return dispatch_index(sliced, std::move(tensorIndices)); +} + +// This mirrors `THPVariable_setitem` in +// torch/csrc/autograd/python_variable_indexing.cpp for "the assigned value is a +// Tensor" case See NOTE [ Setting `disable_slice_optimization` when calling C++ +// tensor indexing functions from Python ] +static inline void set_item( + const Tensor& self, + const ArrayRef& indices, + const Tensor& value, + bool disable_slice_optimization = false) { + at::Device self_device = self.device(); + SymIntArrayRef self_sizes = self.sym_sizes(); + + // handle simple types: integers, slices, ellipsis, bool + if (indices.size() == 1) { + const TensorIndex& index = indices[0]; + if (index.is_boolean() && !index.boolean()) { + // do nothing for false (technically we should check the size, but we + // don't have real 0-sized shapes. + return; + } else if (index.is_ellipsis()) { + copy_to(self, value); + return; + } else if (index.is_none() || (index.is_boolean() && index.boolean())) { + copy_to(self.unsqueeze(0), value); + return; + } else if (index.is_integer()) { + copy_to( + impl::applySelect( + self, 0, index.integer(), 0, self_device, self_sizes), + value); + return; + } else if (index.is_slice()) { + copy_to( + impl::applySlice( + self, + 0, + index.slice().start(), + index.slice().stop(), + index.slice().step(), + /*disable_slice_optimization=*/disable_slice_optimization, + self_device, + self_sizes), + value); + return; + } + } + + std::vector tensorIndices; + Tensor sliced = impl::applySlicing( + self, + indices, + tensorIndices, + disable_slice_optimization, + self_device, + self_sizes); + if (tensorIndices.empty()) { + copy_to(sliced, value); + return; + } + + SymIntArrayRef valueSizes = value.sym_sizes(); + SymIntArrayRef slicedValueSizes = slicePrefix1sSize(valueSizes); + Tensor valuesSliced; + if (!valueSizes.equals(slicedValueSizes)) { + valuesSliced = value.view_symint(slicedValueSizes); + } else { + valuesSliced = value; + } + dispatch_index_put_(sliced, std::move(tensorIndices), valuesSliced); + return; +} + +} // namespace at::indexing diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalPythonObjects.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalPythonObjects.h new file mode 100644 index 0000000000000000000000000000000000000000..c45de86db3abeffe22cb8db559f602d88b35be9c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalPythonObjects.h @@ -0,0 +1,21 @@ +#pragma once + +#include +#include +#include + +namespace at::impl { + +struct TORCH_API ThreadLocalPythonObjects { + static void set(const std::string& key, std::shared_ptr value); + static const std::shared_ptr& get(const std::string& key); + static bool contains(const std::string& key); + + static const ThreadLocalPythonObjects& get_state(); + static void set_state(ThreadLocalPythonObjects state); + + private: + std::unordered_map> obj_dict_; +}; + +} // namespace at::impl diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalState.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalState.h new file mode 100644 index 0000000000000000000000000000000000000000..8419499c3a563c1001a13685278523a64495c3db --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ThreadLocalState.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace at { + +// Thread local state contains values that are preserved across +// thread boundaries (e.g. at::launch/JIT fork, autograd). +// Note at::parallel_for doesn't preserve TLS across thread boundaries. +class TORCH_API ThreadLocalState { + public: + // Saves the thread local variables' values and + // returns them as a ThreadLocalState + ThreadLocalState(); + + // set_grad_mode - force the value of the grad mode TLS in + // the current state object. This is used for example in the + // autograd engine. + void set_grad_mode(bool enabled); + + // set_multithreading_enabled - force the value of the multithreadinmaximum + // threads TLS in + // the current state object. This is used for example in the + // autograd engine. + void set_multithreading_enabled(bool enabled); + + // Sets thread local variables in the current thread, + // according to the thread boundary specified + static void setThreadLocalState(const ThreadLocalState& state); + + private: + c10::impl::LocalDispatchKeySet dispatch_key_; + + // ThreadLocalDebugInfo does not change after being created + // with DebugInfoGuard + std::shared_ptr debug_info_; + + // RecordFunction TLS + RecordFunctionTLS rf_tls_; + + // TLS for out-of-tree functorch + // See NOTE [functorch TLS in pytorch/pytorch] for why this needs to be a + // pointer (spoiler alert: it's due to the indirection) + // This needs to be a shared_ptr instead of a unique_ptr because + // ThreadLocalState is copy-able and does indeed get copied. Maybe we can + // consider adding an explicit copy constructor for ThreadLocalState in the + // future but I didn't want to add one just for this. + std::shared_ptr functorch_tls_; + + // TLS for AutogradModes + AutogradState autograd_tls_; + + // TLS for enable_torch_dispatch_mode + c10::impl::TorchDispatchModeTLS torch_dispatch_mode_state_; + + // TLS for enable_python_dispatcher + c10::impl::PyInterpreter* python_dispatcher_state_; + + // TLS for __torch_function__ (mode and disable_torch_function) + at::impl::PythonTorchFunctionTLS python_torch_function_state_; + + // TLS for saved tensors default hooks + at::impl::SavedTensorDefaultHooksTLS saved_tensors_default_hooks_state_; + + bool functionalization_reapply_views_state_; + + // TLS for arbitrary python objects that is registered via hooks + at::impl::ThreadLocalPythonObjects saved_objects_; + + friend class ThreadLocalStateGuard; +}; + +// Guard to set and reset the thread local state +class TORCH_API ThreadLocalStateGuard { + public: + explicit ThreadLocalStateGuard(const ThreadLocalState& state) + : prev_state_(ThreadLocalState()) { + // set the given state across the thread boundary + ThreadLocalState::setThreadLocalState(state); + } + + ~ThreadLocalStateGuard() { + // restore previously set variables + ThreadLocalState::setThreadLocalState(prev_state_); + } + + private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) + const ThreadLocalState prev_state_; +}; + +template +auto wrapPropagateTLSState(T callback) { + return [tls_state = ThreadLocalState(), + callback = std::move(callback)](auto&&... args) { + ThreadLocalStateGuard g(tls_state); + // Propagate value returned by callback(). + return callback(std::forward(args)...); + }; +} + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TypeDefault.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TypeDefault.h new file mode 100644 index 0000000000000000000000000000000000000000..0361b9d97d9e7148a5eeb5c9ffa09e815723beb4 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/TypeDefault.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { +struct Storage; +} + +namespace at { + +class Tensor; +using TensorList = ArrayRef; + +class Context; +struct Generator; + +struct Quantizer; +// This is temporary typedef to enable Quantizer in aten native function API +// we'll remove them when we are actually exposing Quantizer class +// to frontend +using ConstQuantizerPtr = const c10::intrusive_ptr&; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Version.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Version.h new file mode 100644 index 0000000000000000000000000000000000000000..706da58a5da01c35fda7f2c6374c8f5868f1b642 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/Version.h @@ -0,0 +1,18 @@ +#include + +namespace at { + +/// Returns a detailed string describing the configuration PyTorch. +TORCH_API std::string show_config(); + +TORCH_API std::string get_mkl_version(); + +TORCH_API std::string get_mkldnn_version(); + +TORCH_API std::string get_openmp_version(); + +TORCH_API std::string get_cxx_flags(); + +TORCH_API std::string get_cpu_capability(); + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Atomic.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Atomic.cuh new file mode 100644 index 0000000000000000000000000000000000000000..56ee8f87e253049421c1268a21258f52a729c3b2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Atomic.cuh @@ -0,0 +1,508 @@ +#pragma once + +#include +#include +#include + +#include + +#if !(defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)))) +#include +#endif + +template +struct AtomicFPOp; + +template <> +struct AtomicFPOp { + template + inline __device__ at::Half operator() (at::Half *address, at::Half val, const func_t& func) { + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + at::Half hsum; + do { + assumed = old; + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + hsum = func(hsum, val); + old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + return hsum; + } +}; + +template <> +struct AtomicFPOp { + template + inline __device__ at::BFloat16 operator() (at::BFloat16 *address, at::BFloat16 val, const func_t& func) { + unsigned int * address_as_ui = + (unsigned int *) ((char *)address - ((size_t)address & 2)); + unsigned int old = *address_as_ui; + unsigned int assumed; + + at::BFloat16 bsum; + do { + assumed = old; + bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + bsum = func(bsum, val); + old = (size_t)address & 2 ? (old & 0xffff) | (bsum.x << 16) : (old & 0xffff0000) | bsum.x; + old = atomicCAS(address_as_ui, assumed, old); + } while (assumed != old); + bsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); + return bsum.x; + } +}; + +template <> +struct AtomicFPOp { + template + inline __device__ double operator() (double * address, double val, const func_t& func) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; + unsigned long long int old = *address_as_ull; + unsigned long long int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, func(val, assumed)); + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __longlong_as_double(old); + } +}; + +#define ATOMIC_INTEGER_IMPL(NAME) \ +template \ +struct Atomic##NAME##IntegerImpl; \ + \ +template \ +struct Atomic##NAME##IntegerImpl { \ + template \ + inline __device__ void operator()(T *address, T val, const func_t& func) { \ + size_t offset = (size_t)address & 3; \ + uint32_t * address_as_ui = (uint32_t *)((char *)address - offset); \ + uint32_t old = *address_as_ui; \ + uint32_t shift = offset * 8; \ + uint32_t old_byte; \ + uint32_t newval; \ + uint32_t assumed; \ + \ + do { \ + assumed = old; \ + old_byte = (old >> shift) & 0xff; \ + newval = static_cast(func(val, static_cast(old_byte))); \ + newval = (old & ~(0x000000ff << shift)) | (newval << shift); \ + old = atomicCAS(address_as_ui, assumed, newval); \ + } while (assumed != old); \ + } \ +}; \ + \ +template \ +struct Atomic##NAME##IntegerImpl { \ + template \ + inline __device__ void operator()(T *address, T val, const func_t& func) { \ + size_t offset = (size_t)address & 2; \ + uint32_t * address_as_ui = (uint32_t *)((char *)address - offset); \ + bool is_32_align = offset; \ + uint32_t old = *address_as_ui; \ + uint32_t old_bytes; \ + uint32_t newval; \ + uint32_t assumed; \ + \ + do { \ + assumed = old; \ + old_bytes = is_32_align ? old >> 16 : old & 0xffff; \ + newval = static_cast(func(val, static_cast(old_bytes))); \ + newval = is_32_align ? (old & 0xffff) | (newval << 16) : (old & 0xffff0000) | newval; \ + old = atomicCAS(address_as_ui, assumed, newval); \ + } while (assumed != old); \ + } \ +}; \ + \ +template \ +struct Atomic##NAME##IntegerImpl { \ + template \ + inline __device__ void operator()(T *address, T val, const func_t& func) { \ + uint32_t * address_as_ui = (uint32_t *) (address); \ + uint32_t old = *address_as_ui; \ + uint32_t newval; \ + uint32_t assumed; \ + \ + do { \ + assumed = old; \ + newval = static_cast(func(val, static_cast(old))); \ + old = atomicCAS(address_as_ui, assumed, newval); \ + } while (assumed != old); \ + } \ +}; \ + \ +template \ +struct Atomic##NAME##IntegerImpl { \ + template \ + inline __device__ void operator()(T *address, T val, const func_t& func) { \ + unsigned long long * address_as_ui = (unsigned long long *) (address); \ + unsigned long long old = *address_as_ui; \ + unsigned long long newval; \ + unsigned long long assumed; \ + \ + do { \ + assumed = old; \ + newval = static_cast(func(val, static_cast(old))); \ + old = atomicCAS(address_as_ui, assumed, newval); \ + } while (assumed != old); \ + } \ +}; + + +# define GPU_ATOMIC_INTEGER(NAME, OP, DTYPE) \ +static inline __device__ void gpuAtomic##NAME(DTYPE *address, DTYPE val) { \ +Atomic##NAME##IntegerImpl()(address, \ + val, \ + [](DTYPE a, DTYPE b) { \ + return OP; \ + }); \ +} \ + +ATOMIC_INTEGER_IMPL(Add) +GPU_ATOMIC_INTEGER(Add, a || b, bool) + +// Don't instantiate gpuAtomicAdd with the macro as it seems non-standard (see int32, int64) +static inline __device__ void gpuAtomicAdd(uint8_t *address, uint8_t val) { + AtomicAddIntegerImpl()(address, + val, + [](uint8_t a, uint8_t b) { + return a + b; + }); +} + +static inline __device__ void gpuAtomicAdd(int8_t *address, int8_t val) { + AtomicAddIntegerImpl()(address, + val, + [](int8_t a, int8_t b) { + return a + b; + }); +} + +static inline __device__ void gpuAtomicAdd(int16_t *address, int16_t val) { + AtomicAddIntegerImpl()(address, + val, + [](int16_t a, int16_t b) { + return a + b; + }); +} + +static inline __device__ int32_t gpuAtomicAdd(int32_t *address, int32_t val) { + return atomicAdd(address, val); +} + +static inline __device__ void gpuAtomicAdd(int64_t *address, int64_t val) { +#if defined(USE_ROCM) + __atomic_fetch_add(address, val, __ATOMIC_RELAXED); +#else + static_assert(sizeof(unsigned long long int) == sizeof(int64_t), "bitwidth change is not allowed"); + atomicAdd(reinterpret_cast(address), static_cast(val)); +#endif +} + +static inline __device__ at::Half gpuAtomicAdd(at::Half *address, at::Half val) { +#if defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + return AtomicFPOp()(address, val, + [](at::Half hsum, at::Half val) { + return hsum + val; + }); +#else + return atomicAdd(reinterpret_cast<__half*>(address), val); +#endif +} + +static inline __device__ at::BFloat16 gpuAtomicAdd(at::BFloat16 *address, at::BFloat16 val) { +#if defined(USE_ROCM) || ((defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))) +return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return bsum + val; + }); +#else + __nv_bfloat16 r = atomicAdd(reinterpret_cast<__nv_bfloat16*>(address), *reinterpret_cast<__nv_bfloat16*>(&val)); + return *reinterpret_cast(&r); +#endif +} + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) +// from CUDA C Programmic Guide +static inline __device__ double atomicAdd(double* address, double val) +#if defined(__clang__) && defined(__CUDA__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wgcc-compat" + __attribute__((enable_if(true, ""))) +#pragma GCC diagnostic pop +#endif +{ + + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(val + __longlong_as_double(assumed)); + }); +} +#elif defined(USE_ROCM) || !(defined(__CUDA_ARCH__)) + +/* Note [hip-clang differences to hcc] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * The upcoming hip-clang compiler for ROCm differs from hcc in a few details. + * It exports the __HIP__ macro, we can hence differentiate between hcc and + * hip-clang. In the below, hcc only received support for atomicAdd with double + * typing after work week 18312. hip-clang had support from the first version. + * In general, the code-visible differences between hip-clang and hcc will be + * minimal. + */ + +#if defined(USE_ROCM) && __hcc_workweek__ < 18312 && !__HIP__ + // This needs to be defined for the host side pass + static inline __device__ double atomicAdd(double *address, double val) { } +#endif +#endif + +static inline __device__ double gpuAtomicAdd(double *address, double val) { + return atomicAdd(address, val); +} + +static inline __device__ float gpuAtomicAdd(float *address, float val) { + return atomicAdd(address, val); +} + +template +static inline __device__ void gpuAtomicAdd(c10::complex *address, c10::complex val) { + gpuAtomicAdd(&address->real_, val.real_); + gpuAtomicAdd(&address->imag_, val.imag_); +} + +/* Note [gpuAtomicAdd vs atomicAdd] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * Some extensions such as torchvision call atomicAdd() + * directly and require non-library provided data type support. Only for these, we + * continue to provide atomicAdd overloads. + */ +static inline __device__ at::Half atomicAdd(at::Half *address, at::Half val) { + return gpuAtomicAdd(address, val); +} + +static inline __device__ at::BFloat16 atomicAdd(at::BFloat16 *address, at::BFloat16 val) { + return gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(uint8_t *address, uint8_t val) { + gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(int8_t *address, int8_t val) { + gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(int16_t *address, int16_t val) { + gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(int64_t *address, int64_t val) { + gpuAtomicAdd(address, val); +} + +static inline __device__ void atomicAdd(bool *address, bool val) { + gpuAtomicAdd(address, val); +} + +/* Note [explicitly non-returning atomics] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * AMD's MI100 (gfx908) provides an optimized fp32 atomicAdd, exposed via atomicAddNoRet(). + * Due to compiler limitations, callers must opt-in to guarantee the optimized instruction. + * This non-returning atomicAddNoRet cannot be used to implement the returning atomicAdd, + * therefore we need a new API 'gpuAtomicAddNoReturn'. + */ +template +static inline __device__ void gpuAtomicAddNoReturn(c10::complex *address, c10::complex val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(uint8_t *address, uint8_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(int8_t *address, int8_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(int16_t *address, int16_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(int32_t *address, int32_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(int64_t *address, int64_t val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(bool *address, bool val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(at::Half *address, at::Half val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(at::BFloat16 *address, at::BFloat16 val) { gpuAtomicAdd(address, val); } +static inline __device__ void gpuAtomicAddNoReturn(double *address, double val) { gpuAtomicAdd(address, val); } + +/* Special case fp32 atomic. */ +#if defined(USE_ROCM) +static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { atomicAddNoRet(address, val); } +#else +static inline __device__ void gpuAtomicAddNoReturn(float *address, float val) { gpuAtomicAdd(address, val); } +#endif + +// Atomic multiplication implementation. + +ATOMIC_INTEGER_IMPL(Mul) +GPU_ATOMIC_INTEGER(Mul, a * b, uint8_t) +GPU_ATOMIC_INTEGER(Mul, a * b, int8_t) +GPU_ATOMIC_INTEGER(Mul, a * b, int16_t) +GPU_ATOMIC_INTEGER(Mul, a * b, int32_t) +GPU_ATOMIC_INTEGER(Mul, a * b, int64_t) + +inline __device__ at::Half gpuAtomicMul(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return bsum * val; + }); +} + +inline __device__ at::BFloat16 gpuAtomicMul(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return bsum * val; + }); +} + +inline __device__ double gpuAtomicMul(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(val * __longlong_as_double(assumed)); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. +inline __device__ float gpuAtomicMul (float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(val * + __int_as_float(assumed))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} + +// Atomic maximum implementation. + +template +__host__ __device__ T safe_max(T a, T b) { + #if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + T max = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::max(a, b)); + #else + T max = at::_isnan(b) ? b : std::max(a, b); + #endif + + return max; +} + +ATOMIC_INTEGER_IMPL(Max) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), uint8_t) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int8_t) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int16_t) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int32_t) +GPU_ATOMIC_INTEGER(Max, safe_max(a, b), int64_t) + +inline __device__ at::Half gpuAtomicMax(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return safe_max(bsum, val); + }); +} + +inline __device__ at::BFloat16 gpuAtomicMax(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return safe_max(bsum, val); + }); +} + +inline __device__ double gpuAtomicMax(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(safe_max(val, __longlong_as_double(assumed))); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. +inline __device__ float gpuAtomicMax(float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(safe_max(val, __int_as_float(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} + +// Atomic minimum implementation. + +template +__host__ __device__ T safe_min(T a, T b) { + #if defined(__HIPCC__) + // TODO: remove this special case for HIP when issue is fixed: + // https://github.com/ROCm-Developer-Tools/HIP/issues/2209 + T min = at::_isnan(a) ? a : (at::_isnan(b) ? b : std::min(a, b)); + #else + T min = at::_isnan(b) ? b : std::min(a, b); + #endif + + return min; +} + +ATOMIC_INTEGER_IMPL(Min) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), uint8_t) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int8_t) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int16_t) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int32_t) +GPU_ATOMIC_INTEGER(Min, safe_min(a, b), int64_t) + +inline __device__ at::Half gpuAtomicMin(at::Half * address, at::Half val) { + return AtomicFPOp()(address, val, + [](at::Half bsum, at::Half val) { + return safe_min(bsum, val); + }); +} + +inline __device__ at::BFloat16 gpuAtomicMin(at::BFloat16 * address, at::BFloat16 val) { + return AtomicFPOp()(address, val, + [](at::BFloat16 bsum, at::BFloat16 val) { + return safe_min(bsum, val); + }); +} + +inline __device__ double gpuAtomicMin(double * address, double val) { + return AtomicFPOp()(address, val, + [](double val, unsigned long long int assumed) { + return __double_as_longlong(safe_min(val, __longlong_as_double(assumed))); + }); +} + +// Dont use a templated function for this since the addition function defaults to the CUDA built-in. +inline __device__ float gpuAtomicMin(float * address, float val) { + unsigned int* address_as_ull = (unsigned int*)address; + unsigned int old = *address_as_ull; + unsigned int assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __float_as_int(safe_min(val, __int_as_float(assumed)))); + + // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) + } while (assumed != old); + + return __int_as_float(old); +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAApplyUtils.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAApplyUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4dcdabf17b3b90ad598db48f7210e3932142aaad --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAApplyUtils.cuh @@ -0,0 +1,537 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +// +// This file contains pointwise operation functions and kernels that +// work on both contiguous and non-contiguous tensor arguments of +// arbitrary (up to MAX_CUTORCH_DIMS) dimensioned arguments without +// copying or temporary storage. +// + +/* + NOTE [ CUDA_tensor_applyN helpers ] + + The following CUDA_tensor_applyN (where N currently can be 1, 2, 3, or 4) + functions apply a pointwise operator to N tensor(s). + + The calling convention is + + 1. The template arguments should be, sequentially, + - First N typename args specify the scalar types of each of the N tensors. + - (Optional) `int step` arg specifies the number of elements processed + together at the same time. + Default is 1. + - A usually omitted (i.e., inferred) typename arg specifies the type of the + function/functor applied on `N * step` values in each iteration of each + CUDA thread. + 2. The arguments should be, sequentially, + - N tensors + - op: a function/functor that processes `N * step` values at the same time. + - If `step == 1`, it must have signature + `void(*)(scalar1_t&, scalar2_t&, ..., scalarN_t&)`, where + `scalar*_t`s are the first N typename template args, and the inputs + are the `N` values from the `N` tensors retrieved at a common index. + - Otherwise, it must must have signature + void(*)(int n, scalar1_t&, scalar1_t&, ..., scalar1_t&, // repeat `step` times + scalar2_t&, scalar2_t&, ..., scalar2_t&, // repeat `step` times + ..., + scalarN_t&, scalarN_t&, ..., scalarN_t&) // repeat `step` times + Different from `step == 1` case, it processes `N * step` values taken + from `step` common indices. Moreover, the first input `n` represents the + number of valid indices (it will always have `0 < n <= step`). It will + almost always be `step`, but at the boundary we may not have full `step` + elements and `n` can be a lesser value. + + E.g., if `step == 4` and `N == 2`, `op` could be + + [](int n, scalar1_t &u1, scalar1_t &u2, scalar1_t &u3, scalar1_t &u4, + scalar2_t &v1, scalar2_t &v2, scalar2_t &v3, scalar2_t &v4) { + // Only process u1, ..., un and v1, ..., vn. + // So if `n == 3`, `u4` and `v4` need not to be considered. + } + + In both cases, the references can actually be const, but at least one of + them should be non-const in order to write the output. + - (Optional, but recommended) N TensorArgType args that specify for each + tensor whether `op` reads AND writes ] (i.e., TensorArgType::ReadWrite), + or only reads (i.e., TensorArgType::ReadOnly). + Default is TensorArgType::ReadWrite for first Tensor, and + TensorArgType::ReadOnly for the rest. + + E.g., + + to compute a = b^2 for a and b of same dtype, we can call + + CUDA_tensor_apply2( + a, b, + [] __device__ (scalar &a_val, const scalar &b_val) { a_val = b_val * b_val; } + ); + + to work on 2 values at the same time, we can call + + CUDA_tensor_apply2( + a, b, + [] __device__ (int n, scalar1 &a_val1, scalar1 &a_val2, + const scalar2 &b_val1, const scalar2 &b_val2) { + // call special vectorized op here, or just do elementwise and enjoy unrolling... + // if n == 1, only process a_val1 and b_val1 + } + ); +*/ + +namespace at::cuda { + +// TODO: combine with TensorArg? So far that's been for debugging, and this is functional... +enum class TensorArgType { ReadWrite, ReadOnly }; + +namespace { + +// Rearrange dimensions for pointwise operations so that strides are in +// decreasing order as much as possible, so that kernels have better memory +// access patterns. +// +// For example, consider a binary operation on two "transposed" 2-dim tensors: +// sizes: 256 512 +// aInfo->strides: 1 256 +// bInfo->strides: 1 256 +// +// Given this, each concurrent memory access inside kernelPointwiseApply2() is +// exactly 256 elements apart, resulting in poor performance. +// +// This function exchanges dimensions so that memory access is contiguous: +// sizes: 512 256 +// aInfo->strides: 256 1 +// bInfo->strides: 256 1 +// +// (Actually, it becomes even better because now collapseDims() can turn each +// input into one contiguous array.) +// +// In general, given M (<=4) TensorInfo's with N dimensions, we can view each +// strides[i] (0 <= i < N) as an M-tuple. Given each pair i < j, we exchange +// strides[i] and [j] if +// (1) strides[i][k] < strides[j][k] for some k (0 <= k < M) +// (exchanging them will benefit input #k), and +// (2) strides[i][k] <= strieds[j][k] for all k +// (exchanging them will not make any input worse). +template +inline void rearrangeDims(detail::TensorInfo* aInfo, + detail::TensorInfo* bInfo = nullptr, + detail::TensorInfo* cInfo = nullptr, + detail::TensorInfo* dInfo = nullptr) { + int numInfos = 1; + int dims = aInfo->dims; + IndexType *sizes[4] = { aInfo->sizes, }; + IndexType *strides[4] = { aInfo->strides, }; + + if (bInfo != nullptr) { + ++numInfos; + if (bInfo->dims != dims) return; + sizes[1] = bInfo->sizes; + strides[1] = bInfo->strides; + } + + if (cInfo != nullptr) { + ++numInfos; + if (cInfo->dims != dims) return; + sizes[2] = cInfo->sizes; + strides[2] = cInfo->strides; + } + + if (dInfo != nullptr) { + ++numInfos; + if (dInfo->dims != dims) return; + sizes[3] = dInfo->sizes; + strides[3] = dInfo->strides; + } + + // Bail out if sizes do not match: we are using "deprecated pointwise + // behavior" among tensors of different shapes but same number of elements. + for (int i = 1; i < numInfos; ++i) { + for (int j = 0; j < dims; ++j) { + if (sizes[i][j] != sizes[0][j]) return; + } + } + + for (int i = 0; i < dims - 1; ++i) { + // No need to consider dimensions of size 1. + if (sizes[0][i] == 1) continue; + + for (int j = i + 1; j < dims; ++j) { + if (sizes[0][j] == 1) continue; + + // Compare the relative sizes of strides between dim #i and dim #j. + bool hasIncreasingStrides = false; + bool hasDecreasingStrides = false; + + for (int k = 0; k < numInfos; k++) { + IndexType stride_i = strides[k][i]; + IndexType stride_j = strides[k][j]; + if (stride_i < stride_j) { + hasIncreasingStrides = true; + } else if (stride_i > stride_j) { + hasDecreasingStrides = true; + } + } + + if (hasIncreasingStrides && !hasDecreasingStrides) { + for (int k = 0; k < numInfos; k++) { + IndexType size = sizes[k][i]; + sizes[k][i] = sizes[k][j]; + sizes[k][j] = size; + + IndexType stride = strides[k][i]; + strides[k][i] = strides[k][j]; + strides[k][j] = stride; + } + } + } + } +} + +// The `remaining_steps` argument is used to support Op that operates on +// multiple elements at the same time. Generally, the strategy of ApplyOpN is to +// 1. Initialize `remaining_steps = step`, where `step` is the template arg of +// CUDA_tensor_applyN helpers. The input arg `n` to `apply()` represents the +// number of elements in bound for this call. It will almost always equal to +// `step` except at boundaries. +// 2. If `remaining_steps > 0` convert the current linearIndex to offset (if in +// bound), and recursively call `ApplyOpN` with `remaining_steps - 1`. +// 3. At `remaining_steps = 0`, +// if `step = 1`, call `op(tensor1_val, tensor2_val, ...)`; +// if `step > 1`, call `op(n, tensor1_val1, tensor1_val2, ..., tesor1_valstep, +// tensor2_val1, tensor2_val2, ..., tesor2_valstep, +// ... +// tensorN_val1, tensorN_val2, ..., tesorN_valstep);` +// +// See NOTE [ CUDA_tensor_applyN helpers ] above for how Op may look like. + +template +struct ApplyOp1 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, const Op &op, int n, + IndexType linearIndex, Offsets... aOffsets) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = sizeof...(Offsets) < n ? + detail::IndexToOffset::get(linearIndex, a) : 0; + + ApplyOp1::apply( + a, op, n, linearIndex + 1, aOffsets..., aOffset + ); +} +}; + +// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`). +// We don't need to pass in how many elements need to processed in this case. +template +struct ApplyOp1 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, const Op &op, + int n, IndexType linearIndex, Offset offset) { + op(a.data[offset]); +} +}; + +template +struct ApplyOp1 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, const Op &op, int n, + IndexType linearIndex, Offsets... offsets) { + op(n, a.data[offsets]...); +} +}; + +template +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) +C10_LAUNCH_BOUNDS_2(AT_APPLY_THREADS_PER_BLOCK, AT_APPLY_BLOCKS_PER_SM) +#endif +__global__ void kernelPointwiseApply1(detail::TensorInfo a, + IndexType totalElements, const Op op) { + for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x * step) { + ApplyOp1::apply( + a, op, ::min(step, static_cast(totalElements - linearIndex)), linearIndex); + } +} + + +template +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int64_t n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets) { + // Convert `linearIndex` into an offset of `a` + const IndexType aOffset = static_cast(sizeof...(Offsets)) < n ? + detail::IndexToOffset::get(linearIndex, a) : 0; + + // Convert `linearIndex` into an offset of `b` + const IndexType bOffset = static_cast(sizeof...(Offsets)) < n ? + detail::IndexToOffset::get(linearIndex, b) : 0; + + ApplyOp2::apply( + a, b, op, n, linearIndex + 1, aOffsets..., aOffset, bOffsets..., bOffset + ); +} +}; + +// Specialize `step=1` case (i.e., `remaining_steps=0` and `len(Offsets)=1`). +// We don't need to pass in how many elements need to processed in this case. +template +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int /*n*/, IndexType /*linearIndex*/, + Offset aOffset, Offset bOffset) { + op(a.data[aOffset], b.data[bOffset]); +} +}; + +template +struct ApplyOp2 { +__device__ __forceinline__ +static void apply(detail::TensorInfo &a, + detail::TensorInfo &b, + const Op &op, int n, IndexType linearIndex, + Offsets... aOffsets, Offsets... bOffsets) { + op(n, a.data[aOffsets]..., b.data[bOffsets]...); +} +}; + +template +#if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) +C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) +#endif +__global__ void +kernelPointwiseApply2(detail::TensorInfo a, + detail::TensorInfo b, + IndexType totalElements, + const Op op) { + for (IndexType linearIndex = (blockIdx.x * blockDim.x + threadIdx.x) * step; + linearIndex < totalElements; + linearIndex += gridDim.x * blockDim.x * step) { + ApplyOp2::apply( + a, b, op, ::min(step, static_cast(totalElements - linearIndex)), + linearIndex); + } +} + +} // anonymous namespace + +template +inline bool CUDA_tensor_apply2(at::TensorBase a, + at::TensorBase b, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly) { + TORCH_CHECK(a.device().is_cuda() && b.device().is_cuda(), + "CUDA_tensor_apply2: Expected tensors to have CUDA DeviceType, but got " + "tensors with type ", a.device().type(), " and ", b.device().type()); + int64_t totalElements = a.numel(); + + if (totalElements != b.numel()) { + return false; + } + + if (a.dim() > MAX_TENSORINFO_DIMS || + b.dim() > MAX_TENSORINFO_DIMS) { + return false; + } + + if (a.numel() == 0) { + // Empty tensor; do nothing + return true; + } + const dim3 block = getApplyBlock(max_threads_per_block); + + dim3 grid; + auto curDevice = current_device(); + if (curDevice == -1) return false; + if (!getApplyGrid(totalElements, grid, curDevice, max_threads_per_block)) { + return false; + } + + /* + Expands readable/writable tensors whose indices may be "overlapped." + This ensures that each element of the tensor is operated on once and only + once. + */ + TensorBase oldA; + TensorBase oldB; + + if (aType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(a)) { + // Must perform in contiguous space + oldA = std::exchange(a, a.contiguous()); + } + if (bType == TensorArgType::ReadWrite && detail::maybeOverlappingIndices(b)) { + // Must perform in contiguous space + oldB = std::exchange(b, b.contiguous()); + } + + // It is possible that the tensor dimensions are able to be collapsed, + // and thus we can reduce the actual code complexity of the copy by + // exploiting this knowledge statically, since the div/mod is the + // most expensive part of the operation, more so than memory accesses. + // For instance, when copying a non-contiguous to a contiguous tensor + // (or vice versa), the contiguous tensor can be collapsed to one + // dimension, and the loop to translate the linear index to the array + // index can be similarly collapsed. That is what this unrolling is for. + +#define HANDLE_CASE(TYPE, A, B) \ + kernelPointwiseApply2 \ + <<>>( \ + aInfo, bInfo, static_cast(totalElements), op); \ + C10_CUDA_KERNEL_LAUNCH_CHECK(); + +#define HANDLE_B_CASE(TYPE, A, B) { \ + switch (B) { \ + case 1: \ + HANDLE_CASE(TYPE, A, 1); \ + break; \ + case 2: \ + HANDLE_CASE(TYPE, A, 2); \ + break; \ + default: \ + HANDLE_CASE(TYPE, A, -1); \ + break; \ + } \ +} + +#define HANDLE_A_CASE(TYPE, A, B) { \ + switch (A) { \ + case 1: \ + HANDLE_B_CASE(TYPE, 1, B); \ + break; \ + case 2: \ + HANDLE_B_CASE(TYPE, 2, B); \ + break; \ + default: \ + HANDLE_B_CASE(TYPE, -1, B); \ + break; \ + } \ +} + + if (detail::canUse32BitIndexMath(a) && + detail::canUse32BitIndexMath(b)) { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + + HANDLE_A_CASE(unsigned int, aInfo.dims, bInfo.dims); + } else { + detail::TensorInfo aInfo = + detail::getTensorInfo(a); + + detail::TensorInfo bInfo = + detail::getTensorInfo(b); + rearrangeDims(&aInfo, &bInfo); + aInfo.collapseDims(); + bInfo.collapseDims(); + + /* + Only instantiates the all 1D special case and the fallback all nD case for + large (64-bit indexed) tensors to reduce compilation time. + */ + if (aInfo.dims == 1 && bInfo.dims == 1) { + HANDLE_CASE(uint64_t, 1, 1); + } else { + HANDLE_CASE(uint64_t, -1, -1); + } + } +#undef HANDLE_CASE +#undef HANDLE_B_CASE +#undef HANDLE_A_CASE + + if (oldA.defined()) { + at::native::copy_ignoring_overlaps(oldA, a); + } + + if (oldB.defined()) { + at::native::copy_ignoring_overlaps(oldB, b); + } + + return true; +} + +/* Provides default step = 1 to CUDA_tensor_apply2. */ +template +inline bool CUDA_tensor_apply2(const at::TensorBase &a, + const at::TensorBase &b, + const Op op, + TensorArgType aType = TensorArgType::ReadWrite, + TensorArgType bType = TensorArgType::ReadOnly) { + return CUDA_tensor_apply2(a, b, op, aType, bType); +} + +} // namespace at::cuda diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGeneratorImpl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGeneratorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..2fe8a6f6c8f4fe9ac6a8bc8f78c145ae48e15c9c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGeneratorImpl.h @@ -0,0 +1,138 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { +/** + * Note [CUDA Graph-safe RNG states] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Strategy: + * ~~~~~~~~~ + * (It helps to look at + * cuda/detail/PhiloxCudaStateRaw.cuh and + * cuda/detail/UnpackRaw.cuh + * while you read this.) + * + * A CUDA graph containing multiple RNG ops behaves like a + * single giant kernel from the perspective of ops external + * to the graph. During graph capture, logic in CUDAGeneratorImpl + * records the total of all offset increments that occur in the + * graphed region, and records the final total as the offset for + * the entire graph. + * + * When the graph reruns, the logic that reruns it + * increments this device's CUDA generator's offset + * by that total. + * + * Meanwhile, within the graph, at capture time, instead of + * populating PhiloxCudaStates with the uint64_t offset pulled + * directly from the global state, PhiloxCudaState uses a pointer + * to a one-element stream-local int64_t device tensor + * holding an initial offset value, and a uint64_t holding an + * intra-graph offset. (The intra-graph offset starts from zero + * when capture begins.) In each consumer kernel, + * at::cuda::philox::unpack computes the offset to use for this kernel + * as intra-graph offset + *initial offset. + * + * When the graph reruns, the logic that reruns it first + * fill_s the initial offset tensor with this device's + * CUDA generator's current offset. + * + * The control flow above ensures graphed execution is bitwise + * identical to eager execution as long as RNG ops are enqueued + * from a single thread, even if RNG ops and graphs containing + * RNG ops are enqueued and run simultaneously on multiple streams. + * + * Usage: + * ~~~~~~ + * PhiloxCudaState in this file, and unpack() in + * cuda/CUDAGraphsUtils.cuh allow non-divergent use of + * CUDAGeneratorImpl whether graph capture is underway or not. + * + * Each PhiloxCudaState instance should be used for one and only one + * consumer kernel. + * + * Example (see e.g. native/cuda/Dropout.cu): + * + * #include + * #include + * + * __global__ void kernel(..., PhiloxCudaState philox_args) { + * auto seeds = at::cuda::philox::unpack(philox_args); + * IndexType idx = blockIdx.x * blockDim.x + threadIdx.x; + * curandStatePhilox4_32_10_t state; + * curand_init(std::get<0>(seeds), // seed + * idx, // per-thread subsequence + * std::get<1>(seeds), // offset in subsequence + * &state); + * ... + * } + * + * host_caller(...) { + * PhiloxCudaState rng_engine_inputs; + * { + * // See Note [Acquire lock when using random generators] + * std::lock_guard lock(gen->mutex_); + * + * // gen could be HostState or DevState here! No divergent code needed! + * rng_engine_inputs = gen->philox_cuda_state(offset_increment); + * } + * kernel<<<...>>>(..., rng_engine_inputs); + * } + * + */ + +struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { + // Constructors + CUDAGeneratorImpl(DeviceIndex device_index = -1); + ~CUDAGeneratorImpl() override = default; + + // CUDAGeneratorImpl methods + std::shared_ptr clone() const; + void set_current_seed(uint64_t seed) override; + void set_offset(uint64_t offset) override; + uint64_t get_offset() const override; + uint64_t current_seed() const override; + uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; + void set_philox_offset_per_thread(uint64_t offset); + uint64_t philox_offset_per_thread() const; + void capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph); + uint64_t capture_epilogue(); + PhiloxCudaState philox_cuda_state(uint64_t increment); + + bool reset_rnn_state() { + return !no_reset_rnn_state_.test_and_set(); + } + + // Temporarily accommodates call sites that use philox_engine_inputs. + // Allows incremental refactor of call sites to use philox_cuda_state. + std::pair philox_engine_inputs(uint64_t increment); + + static c10::DeviceType device_type(); + +private: + CUDAGeneratorImpl* clone_impl() const override; + uint64_t seed_ = default_rng_seed_val; + uint64_t philox_offset_per_thread_ = 0; + int64_t* seed_extragraph_{}; + int64_t* offset_extragraph_{}; + uint32_t offset_intragraph_ = 0; + bool graph_expects_this_gen_ = false; + std::atomic_flag no_reset_rnn_state_; +}; + +namespace cuda::detail { + +TORCH_CUDA_CPP_API const Generator& getDefaultCUDAGenerator( + DeviceIndex device_index = -1); +TORCH_CUDA_CPP_API Generator createCUDAGenerator(DeviceIndex device_index = -1); + +} // namespace cuda::detail +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGraph.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGraph.h new file mode 100644 index 0000000000000000000000000000000000000000..b395de9a252a7e1656340b572cc110113686b207 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGraph.h @@ -0,0 +1,92 @@ +#pragma once + +#include +#include +#include +#include + +#include + +namespace at { + +struct CUDAGeneratorImpl; + +namespace cuda { + +// Standalone way to get a unique mempool id usable as a pool=... argument +// to CUDAGraph::capture_begin +TORCH_CUDA_CPP_API MempoolId_t graph_pool_handle(); + +struct TORCH_CUDA_CPP_API CUDAGraph { + CUDAGraph(); + ~CUDAGraph(); + + static void inc_pending_event_queries(); + static void dec_pending_event_queries(); + static int num_pending_event_queries(); + void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); + void capture_end(); + void replay(); + void reset(); + MempoolId_t pool(); + void enable_debug_mode(); + void debug_dump(const std::string& debug_path); + + protected: +#if !defined(USE_ROCM) || ROCM_VERSION >= 50300 + cudaGraph_t graph_ = NULL; + cudaGraphExec_t graph_exec_ = NULL; +#endif + + static std::atomic pending_event_queries; + + // internal states so reset() can do its best cleaning up + // Set to true in capture_end if cudaStreamEndCapture succeeded + // Set back to false soon after, when graph_ is consumed by cudaGraphInstantiate + // to create graph_exec_, then graph_ is deleted + bool has_graph_ = false; + // Set to true in capture_end if cudaGraphInstantiate succeeded + bool has_graph_exec_ = false; + + // uuid of this instance's current capture, used to + // specify the pool. + CaptureId_t id_; + + // the ID assigned by cuda during graph capture, + // used to identify when a stream is participating in capture + CaptureId_t capture_id_ = -1; + + // uuid used to request a particular private mempool from CUDACachingAllocator. + // By default, this will be set to {id_, 0}. + // + // If capture_begin is called with "pool=other_graph.pool()", this graph's mempool_id_ + // will be set to the other graph's mempool_id_, and therefore share a mempool with the + // other graph. + // + // If capture_begin is called with "pool=handle" where "handle" came from graph_pool_handle(), + // it will share a mempool with any other captures that used "pool=handle". + // + // Sharing a mempool across graphs saves memory, and it's safe if you + // know you'll replay those graphs in the same order you captured them. + MempoolId_t mempool_id_; + + // Stream on which capture began + at::cuda::CUDAStream capture_stream_; + + // Default generator on device where capture began + at::CUDAGeneratorImpl* capture_gen_; + + // Device where capture occurred. Right now, for simplicity, we require all ops + // in a capture to run on the same device, but this is a limitation of CUDAGraph, + // not CUDA itself. We can straightforwardly modify CUDAGraph to support multi-device + // captures if needed. + int capture_dev_; + + // RNG state trackers + at::Tensor seed_extragraph_; + at::Tensor offset_extragraph_; + uint64_t wholegraph_increment_; +}; + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGraphsUtils.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGraphsUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..f3e27a7b159ac6029fc85fd91a048202dfb352e1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDAGraphsUtils.cuh @@ -0,0 +1,57 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +// c10/cuda/CUDAGraphsC10Utils.h has utils used by both c10 and aten. +// This file adds utils used by aten only. + +namespace at::cuda { + +using CaptureId_t = c10::cuda::CaptureId_t; +using CaptureStatus = c10::cuda::CaptureStatus; + +// Use this version where you don't want to create a CUDA context if none exists. +inline CaptureStatus currentStreamCaptureStatus() { +#if !defined(USE_ROCM) || ROCM_VERSION >= 50300 + // don't create a context if we don't have to + if (c10::cuda::hasPrimaryContext(c10::cuda::current_device())) { + return c10::cuda::currentStreamCaptureStatusMayInitCtx(); + } else { + return CaptureStatus::None; + } +#else + return CaptureStatus::None; +#endif +} + +inline void assertNotCapturing(std::string attempt) { + auto status = currentStreamCaptureStatus(); + TORCH_CHECK(status == CaptureStatus::None, + attempt, + " during CUDA graph capture. If you need this call to be captured, " + "please file an issue. " + "Current cudaStreamCaptureStatus: ", + status); +} + +inline void errorIfCapturingCudnnBenchmark(std::string version_specific) { + auto status = currentStreamCaptureStatus(); + TORCH_CHECK(status == CaptureStatus::None, + "Current cudaStreamCaptureStatus: ", + status, + "\nCapturing ", + version_specific, + "is prohibited. Possible causes of this error:\n" + "1. No warmup iterations occurred before capture.\n" + "2. The convolutions you're trying to capture use dynamic shapes, " + "in which case capturing them is generally prohibited."); +} + +} // namespace at::cuda diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDASparseBlas.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDASparseBlas.h new file mode 100644 index 0000000000000000000000000000000000000000..c99d42c9a7de8852d84167bbca70bc242aabb2b2 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDASparseBlas.h @@ -0,0 +1,318 @@ +#pragma once + +/* + Provides a subset of cuSPARSE functions as templates: + + csrgeam2(...) + + where scalar_t is double, float, c10::complex or c10::complex. + The functions are available in at::cuda::sparse namespace. +*/ + +#include +#include + +namespace at::cuda::sparse { + +#define CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, int m, int n, const scalar_t *alpha, \ + const cusparseMatDescr_t descrA, int nnzA, \ + const scalar_t *csrSortedValA, const int *csrSortedRowPtrA, \ + const int *csrSortedColIndA, const scalar_t *beta, \ + const cusparseMatDescr_t descrB, int nnzB, \ + const scalar_t *csrSortedValB, const int *csrSortedRowPtrB, \ + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, \ + const scalar_t *csrSortedValC, const int *csrSortedRowPtrC, \ + const int *csrSortedColIndC, size_t *pBufferSizeInBytes + +template +inline void csrgeam2_bufferSizeExt( + CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::csrgeam2_bufferSizeExt: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void csrgeam2_bufferSizeExt( + CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(float)); +template <> +void csrgeam2_bufferSizeExt( + CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(double)); +template <> +void csrgeam2_bufferSizeExt>( + CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex)); +template <> +void csrgeam2_bufferSizeExt>( + CUSPARSE_CSRGEAM2_BUFFERSIZE_ARGTYPES(c10::complex)); + +#define CUSPARSE_CSRGEAM2_NNZ_ARGTYPES() \ + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, \ + int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA, \ + const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB, \ + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, \ + int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace + +template +inline void csrgeam2Nnz(CUSPARSE_CSRGEAM2_NNZ_ARGTYPES()) { + TORCH_CUDASPARSE_CHECK(cusparseXcsrgeam2Nnz( + handle, + m, + n, + descrA, + nnzA, + csrSortedRowPtrA, + csrSortedColIndA, + descrB, + nnzB, + csrSortedRowPtrB, + csrSortedColIndB, + descrC, + csrSortedRowPtrC, + nnzTotalDevHostPtr, + workspace)); +} + +#define CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, int m, int n, const scalar_t *alpha, \ + const cusparseMatDescr_t descrA, int nnzA, \ + const scalar_t *csrSortedValA, const int *csrSortedRowPtrA, \ + const int *csrSortedColIndA, const scalar_t *beta, \ + const cusparseMatDescr_t descrB, int nnzB, \ + const scalar_t *csrSortedValB, const int *csrSortedRowPtrB, \ + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, \ + scalar_t *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC, \ + void *pBuffer + +template +inline void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::csrgeam2: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(float)); +template <> +void csrgeam2(CUSPARSE_CSRGEAM2_ARGTYPES(double)); +template <> +void csrgeam2>( + CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex)); +template <> +void csrgeam2>( + CUSPARSE_CSRGEAM2_ARGTYPES(c10::complex)); + +#define CUSPARSE_BSRMM_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, cusparseDirection_t dirA, \ + cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, \ + int kb, int nnzb, const scalar_t *alpha, \ + const cusparseMatDescr_t descrA, const scalar_t *bsrValA, \ + const int *bsrRowPtrA, const int *bsrColIndA, int blockDim, \ + const scalar_t *B, int ldb, const scalar_t *beta, scalar_t *C, int ldc + +template +inline void bsrmm(CUSPARSE_BSRMM_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::bsrmm: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void bsrmm(CUSPARSE_BSRMM_ARGTYPES(float)); +template <> +void bsrmm(CUSPARSE_BSRMM_ARGTYPES(double)); +template <> +void bsrmm>(CUSPARSE_BSRMM_ARGTYPES(c10::complex)); +template <> +void bsrmm>(CUSPARSE_BSRMM_ARGTYPES(c10::complex)); + +#define CUSPARSE_BSRMV_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, cusparseDirection_t dirA, \ + cusparseOperation_t transA, int mb, int nb, int nnzb, \ + const scalar_t *alpha, const cusparseMatDescr_t descrA, \ + const scalar_t *bsrValA, const int *bsrRowPtrA, const int *bsrColIndA, \ + int blockDim, const scalar_t *x, const scalar_t *beta, scalar_t *y + +template +inline void bsrmv(CUSPARSE_BSRMV_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::bsrmv: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void bsrmv(CUSPARSE_BSRMV_ARGTYPES(float)); +template <> +void bsrmv(CUSPARSE_BSRMV_ARGTYPES(double)); +template <> +void bsrmv>(CUSPARSE_BSRMV_ARGTYPES(c10::complex)); +template <> +void bsrmv>(CUSPARSE_BSRMV_ARGTYPES(c10::complex)); + +#if AT_USE_HIPSPARSE_TRIANGULAR_SOLVE() + +#define CUSPARSE_BSRSV2_BUFFER_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, cusparseDirection_t dirA, \ + cusparseOperation_t transA, int mb, int nnzb, \ + const cusparseMatDescr_t descrA, scalar_t *bsrValA, \ + const int *bsrRowPtrA, const int *bsrColIndA, int blockDim, \ + bsrsv2Info_t info, int *pBufferSizeInBytes + +template +inline void bsrsv2_bufferSize(CUSPARSE_BSRSV2_BUFFER_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::bsrsv2_bufferSize: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void bsrsv2_bufferSize(CUSPARSE_BSRSV2_BUFFER_ARGTYPES(float)); +template <> +void bsrsv2_bufferSize(CUSPARSE_BSRSV2_BUFFER_ARGTYPES(double)); +template <> +void bsrsv2_bufferSize>( + CUSPARSE_BSRSV2_BUFFER_ARGTYPES(c10::complex)); +template <> +void bsrsv2_bufferSize>( + CUSPARSE_BSRSV2_BUFFER_ARGTYPES(c10::complex)); + +#define CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, cusparseDirection_t dirA, \ + cusparseOperation_t transA, int mb, int nnzb, \ + const cusparseMatDescr_t descrA, const scalar_t *bsrValA, \ + const int *bsrRowPtrA, const int *bsrColIndA, int blockDim, \ + bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer + +template +inline void bsrsv2_analysis(CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::bsrsv2_analysis: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void bsrsv2_analysis(CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(float)); +template <> +void bsrsv2_analysis(CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(double)); +template <> +void bsrsv2_analysis>( + CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(c10::complex)); +template <> +void bsrsv2_analysis>( + CUSPARSE_BSRSV2_ANALYSIS_ARGTYPES(c10::complex)); + +#define CUSPARSE_BSRSV2_SOLVE_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, cusparseDirection_t dirA, \ + cusparseOperation_t transA, int mb, int nnzb, const scalar_t *alpha, \ + const cusparseMatDescr_t descrA, const scalar_t *bsrValA, \ + const int *bsrRowPtrA, const int *bsrColIndA, int blockDim, \ + bsrsv2Info_t info, const scalar_t *x, scalar_t *y, \ + cusparseSolvePolicy_t policy, void *pBuffer + +template +inline void bsrsv2_solve(CUSPARSE_BSRSV2_SOLVE_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::bsrsv2_solve: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void bsrsv2_solve(CUSPARSE_BSRSV2_SOLVE_ARGTYPES(float)); +template <> +void bsrsv2_solve(CUSPARSE_BSRSV2_SOLVE_ARGTYPES(double)); +template <> +void bsrsv2_solve>( + CUSPARSE_BSRSV2_SOLVE_ARGTYPES(c10::complex)); +template <> +void bsrsv2_solve>( + CUSPARSE_BSRSV2_SOLVE_ARGTYPES(c10::complex)); + +#define CUSPARSE_BSRSM2_BUFFER_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, cusparseDirection_t dirA, \ + cusparseOperation_t transA, cusparseOperation_t transX, int mb, int n, \ + int nnzb, const cusparseMatDescr_t descrA, scalar_t *bsrValA, \ + const int *bsrRowPtrA, const int *bsrColIndA, int blockDim, \ + bsrsm2Info_t info, int *pBufferSizeInBytes + +template +inline void bsrsm2_bufferSize(CUSPARSE_BSRSM2_BUFFER_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::bsrsm2_bufferSize: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void bsrsm2_bufferSize(CUSPARSE_BSRSM2_BUFFER_ARGTYPES(float)); +template <> +void bsrsm2_bufferSize(CUSPARSE_BSRSM2_BUFFER_ARGTYPES(double)); +template <> +void bsrsm2_bufferSize>( + CUSPARSE_BSRSM2_BUFFER_ARGTYPES(c10::complex)); +template <> +void bsrsm2_bufferSize>( + CUSPARSE_BSRSM2_BUFFER_ARGTYPES(c10::complex)); + +#define CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, cusparseDirection_t dirA, \ + cusparseOperation_t transA, cusparseOperation_t transX, int mb, int n, \ + int nnzb, const cusparseMatDescr_t descrA, const scalar_t *bsrValA, \ + const int *bsrRowPtrA, const int *bsrColIndA, int blockDim, \ + bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer + +template +inline void bsrsm2_analysis(CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::bsrsm2_analysis: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void bsrsm2_analysis(CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(float)); +template <> +void bsrsm2_analysis(CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(double)); +template <> +void bsrsm2_analysis>( + CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(c10::complex)); +template <> +void bsrsm2_analysis>( + CUSPARSE_BSRSM2_ANALYSIS_ARGTYPES(c10::complex)); + +#define CUSPARSE_BSRSM2_SOLVE_ARGTYPES(scalar_t) \ + cusparseHandle_t handle, cusparseDirection_t dirA, \ + cusparseOperation_t transA, cusparseOperation_t transX, int mb, int n, \ + int nnzb, const scalar_t *alpha, const cusparseMatDescr_t descrA, \ + const scalar_t *bsrValA, const int *bsrRowPtrA, const int *bsrColIndA, \ + int blockDim, bsrsm2Info_t info, const scalar_t *B, int ldb, \ + scalar_t *X, int ldx, cusparseSolvePolicy_t policy, void *pBuffer + +template +inline void bsrsm2_solve(CUSPARSE_BSRSM2_SOLVE_ARGTYPES(scalar_t)) { + TORCH_INTERNAL_ASSERT( + false, + "at::cuda::sparse::bsrsm2_solve: not implemented for ", + typeid(scalar_t).name()); +} + +template <> +void bsrsm2_solve(CUSPARSE_BSRSM2_SOLVE_ARGTYPES(float)); +template <> +void bsrsm2_solve(CUSPARSE_BSRSM2_SOLVE_ARGTYPES(double)); +template <> +void bsrsm2_solve>( + CUSPARSE_BSRSM2_SOLVE_ARGTYPES(c10::complex)); +template <> +void bsrsm2_solve>( + CUSPARSE_BSRSM2_SOLVE_ARGTYPES(c10::complex)); + +#endif // AT_USE_HIPSPARSE_TRIANGULAR_SOLVE + +} // namespace at::cuda::sparse diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDATensorMethods.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDATensorMethods.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e4e89ea1cdb77da1d7866ffe99c64dabfd735d27 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/CUDATensorMethods.cuh @@ -0,0 +1,15 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace at { +template <> +inline __half* Tensor::data() const { + return reinterpret_cast<__half*>(data()); +} +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/NumericLimits.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/NumericLimits.cuh new file mode 100644 index 0000000000000000000000000000000000000000..7081e94837caa7d5050128e0bfe19aa67f93cd39 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/NumericLimits.cuh @@ -0,0 +1,121 @@ +#pragma once + +#include +#include +#include +#include + +// NumericLimits.cuh is a holder for numeric limits definitions of commonly used +// types. This header is very specific to ROCm HIP and may be removed in the future. +// This header is derived from the legacy THCNumerics.cuh. + +// The lower_bound and upper_bound constants are same as lowest and max for +// integral types, but are -inf and +inf for floating point types. They are +// useful in implementing min, max, etc. + +namespace at { + +template +struct numeric_limits { +}; + +// WARNING: the following at::numeric_limits definitions are there only to support +// HIP compilation for the moment. Use std::numeric_limits if you are not +// compiling for ROCm. +// from @colesbury: "The functions on numeric_limits aren't marked with +// __device__ which is why they don't work with ROCm. CUDA allows them +// because they're constexpr." + +namespace { + // ROCm doesn't like INFINITY too. + constexpr double inf = INFINITY; +} + +template <> +struct numeric_limits { + static inline __host__ __device__ bool lowest() { return false; } + static inline __host__ __device__ bool max() { return true; } + static inline __host__ __device__ bool lower_bound() { return false; } + static inline __host__ __device__ bool upper_bound() { return true; } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ uint8_t lowest() { return 0; } + static inline __host__ __device__ uint8_t max() { return UINT8_MAX; } + static inline __host__ __device__ uint8_t lower_bound() { return 0; } + static inline __host__ __device__ uint8_t upper_bound() { return UINT8_MAX; } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ int8_t lowest() { return INT8_MIN; } + static inline __host__ __device__ int8_t max() { return INT8_MAX; } + static inline __host__ __device__ int8_t lower_bound() { return INT8_MIN; } + static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ int16_t lowest() { return INT16_MIN; } + static inline __host__ __device__ int16_t max() { return INT16_MAX; } + static inline __host__ __device__ int16_t lower_bound() { return INT16_MIN; } + static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ int32_t lowest() { return INT32_MIN; } + static inline __host__ __device__ int32_t max() { return INT32_MAX; } + static inline __host__ __device__ int32_t lower_bound() { return INT32_MIN; } + static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; } +}; + +template <> +struct numeric_limits { +#ifdef _MSC_VER + static inline __host__ __device__ int64_t lowest() { return _I64_MIN; } + static inline __host__ __device__ int64_t max() { return _I64_MAX; } + static inline __host__ __device__ int64_t lower_bound() { return _I64_MIN; } + static inline __host__ __device__ int64_t upper_bound() { return _I64_MAX; } +#else + static inline __host__ __device__ int64_t lowest() { return INT64_MIN; } + static inline __host__ __device__ int64_t max() { return INT64_MAX; } + static inline __host__ __device__ int64_t lower_bound() { return INT64_MIN; } + static inline __host__ __device__ int64_t upper_bound() { return INT64_MAX; } +#endif +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ at::Half lowest() { return at::Half(0xFBFF, at::Half::from_bits()); } + static inline __host__ __device__ at::Half max() { return at::Half(0x7BFF, at::Half::from_bits()); } + static inline __host__ __device__ at::Half lower_bound() { return at::Half(0xFC00, at::Half::from_bits()); } + static inline __host__ __device__ at::Half upper_bound() { return at::Half(0x7C00, at::Half::from_bits()); } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ at::BFloat16 lowest() { return at::BFloat16(0xFF7F, at::BFloat16::from_bits()); } + static inline __host__ __device__ at::BFloat16 max() { return at::BFloat16(0x7F7F, at::BFloat16::from_bits()); } + static inline __host__ __device__ at::BFloat16 lower_bound() { return at::BFloat16(0xFF80, at::BFloat16::from_bits()); } + static inline __host__ __device__ at::BFloat16 upper_bound() { return at::BFloat16(0x7F80, at::BFloat16::from_bits()); } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ float lowest() { return -FLT_MAX; } + static inline __host__ __device__ float max() { return FLT_MAX; } + static inline __host__ __device__ float lower_bound() { return -static_cast(inf); } + static inline __host__ __device__ float upper_bound() { return static_cast(inf); } +}; + +template <> +struct numeric_limits { + static inline __host__ __device__ double lowest() { return -DBL_MAX; } + static inline __host__ __device__ double max() { return DBL_MAX; } + static inline __host__ __device__ double lower_bound() { return -inf; } + static inline __host__ __device__ double upper_bound() { return inf; } +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Sleep.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Sleep.h new file mode 100644 index 0000000000000000000000000000000000000000..d31bf68ccafba92a242894e8f426329693212f5c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/Sleep.h @@ -0,0 +1,10 @@ +#pragma once +#include +#include + +namespace at::cuda { + +// enqueues a kernel that spins for the specified number of cycles +TORCH_CUDA_CU_API void sleep(int64_t cycles); + +} // namespace at::cuda diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..dddeab1e2675fd791b678eb1da9c81713b8440e1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/CUDAHooks.h @@ -0,0 +1,54 @@ +#pragma once + +#include + +#include +#include + +// TODO: No need to have this whole header, we can just put it all in +// the cpp file + +namespace at::cuda::detail { + +// Set the callback to initialize Magma, which is set by +// torch_cuda_cu. This indirection is required so magma_init is called +// in the same library where Magma will be used. +TORCH_CUDA_CPP_API void set_magma_init_fn(void (*magma_init_fn)()); + + +// The real implementation of CUDAHooksInterface +struct CUDAHooks : public at::CUDAHooksInterface { + CUDAHooks(at::CUDAHooksArgs) {} + void initCUDA() const override; + Device getDeviceFromPtr(void* data) const override; + bool isPinnedPtr(const void* data) const override; + const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const override; + bool hasCUDA() const override; + bool hasMAGMA() const override; + bool hasCuDNN() const override; + bool hasCuSOLVER() const override; + bool hasROCM() const override; + const at::cuda::NVRTC& nvrtc() const override; + DeviceIndex current_device() const override; + bool hasPrimaryContext(DeviceIndex device_index) const override; + Allocator* getCUDADeviceAllocator() const override; + Allocator* getPinnedMemoryAllocator() const override; + bool compiledWithCuDNN() const override; + bool compiledWithMIOpen() const override; + bool supportsDilatedConvolutionWithCuDNN() const override; + bool supportsDepthwiseConvolutionWithCuDNN() const override; + bool supportsBFloat16ConvolutionWithCuDNNv8() const override; + bool hasCUDART() const override; + long versionCUDART() const override; + long versionCuDNN() const override; + std::string showConfig() const override; + double batchnormMinEpsilonCuDNN() const override; + int64_t cuFFTGetPlanCacheMaxSize(DeviceIndex device_index) const override; + void cuFFTSetPlanCacheMaxSize(DeviceIndex device_index, int64_t max_size) const override; + int64_t cuFFTGetPlanCacheSize(DeviceIndex device_index) const override; + void cuFFTClearPlanCache(DeviceIndex device_index) const override; + int getNumGPUs() const override; + void deviceSynchronize(DeviceIndex device_index) const override; +}; + +} // at::cuda::detail diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h new file mode 100644 index 0000000000000000000000000000000000000000..1eb0e245efd20968bff6c15cac65eedabdd28636 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/DeviceThreadHandles.h @@ -0,0 +1,151 @@ +// Some stateful GPU libraries, such as cuDNN, cuBLAS, use handles to store states. +// These handles are tied to device, and these libraries requires/recommends not to +// share handles across host threads. +// +// These libraries recommend using one handle per host thread. We may not want to do +// this because threads are relatively light-weight, but creating and destroying +// handles is expensive (destroying the handle causes synchronizations). DataParallel, +// for example, creates new threads for each forward pass. +// +// This file implements a handle pool mechanism. The handle pool returns handles on +// demand as threads request them. If all existing handles in the pool are in use, +// it creates a new one. As threads terminate, they release handles back into the pool. +// In this way, the handle pool never creates more handles than the high-water mark of +// active threads, so it's efficient with DataParallel. + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace at::cuda { namespace { + +template +struct DeviceThreadHandlePool : public std::enable_shared_from_this> { + + struct Handle { + Handle_t handle; + Handle(bool create = false) : handle(nullptr) + { + if(create) Create(&handle); + } + // std::vector.emplace() and push_back() may route through temporaries and call + // copy/move constructors along the way. If this is the case, we don't want + // the destructors of temporaries to call cudnnDestroy on the handle. + // We can achieve safety (for the narrow case of stashing within std::vectors) + // by making Handle moveable but not copyable, and transferring handle ownership + // to the latest constructed object. This is not a substitute for full-blown + // reference counting, but reference counting may be overkill here. + // Another alternative is to wrap the saved Handles in unique_ptrs, i.e., + // unordered_map>> created_handles; + Handle(const Handle& rhs) = delete; + // Following https://stackoverflow.com/questions/3279543/what-is-the-copy-and-swap-idiom + Handle(Handle&& rhs) : Handle() { std::swap(handle, rhs.handle); } + // operator= takes argument by value + Handle& operator=(Handle rhs) { std::swap(handle, rhs.handle); return *this; } + ~Handle() { + if(handle) Destroy(handle); + } + }; + + std::mutex mutex; + + // Handles are lazily created as different threads request them, + // but are never destroyed until the end of the process. + // The maximum number of handles this process will create for each device is equal + // to the high-water mark of the number of concurrently active threads that request + // handles for that device. + // When threads terminate, they release their handles back into the pool for reuse. + // Otherwise, new handles would be created every time new threads were spawned, + // resulting in poor performance for Python modules that repeatedly or frequently + // spawned new sets of threads (like DataParallel, which creates a new set of threads + // for each forward pass). + // + // To prevent potential deadlocks, we explicitly choose not to cap the number + // of handles that are created per device. + // Example of danger: If we cap the max handles at 4, and 5 threads are sharing a device, + // only 4 can make forward progress at any time. The other 4 will not release their + // handles until they exit, so the fifth cannot make progress until then. This is + // not a problem...UNLESS all 5 threads attempt some sort of synchronization at an + // intermediate point (ie, before any of them have exited). We have no way to anticipate + // or enforce that user threads will not attempt such intermediate synchronization. + // The only way to ensure safety is to avoid imposing a cap on the number of handles. + std::unordered_map> created_handles; + std::unordered_map> available_handles; + + // PoolWindow lazily creates and caches the handles that a particular thread is using, + // so in the common case handle access doesn't incur either handle creation or a mutex lock. + class PoolWindow + { + public: + PoolWindow(std::shared_ptr parent): weak_parent(std::move(parent)) {} + ~PoolWindow(){ release(); } + + Handle_t reserve(int device) + { + // If this thread already has a handle for this device, return it + if(my_handles.find(device) != my_handles.end()) + return my_handles[device]; + + // otherwise, either grab a handle from the pool if one is available, + // or if not, create a new one. + auto parent = weak_parent.lock(); + TORCH_CHECK(parent, "Cannot create handle during program termination"); + std::lock_guard guard(parent->mutex); + + if(parent->available_handles[device].size() > 0) + { + my_handles[device] = parent->available_handles[device].back(); + parent->available_handles[device].pop_back(); + } + else + { + // In local testing, I do observe that emplace_back sometimes routes through temporaries + // that incur move-constructor and destructor calls. See comments in Handle above. + parent->created_handles[device].emplace_back(true /*create*/); + my_handles[device] = parent->created_handles[device].back().handle; + } + + return my_handles[device]; + } + + private: + // Stores the per-device handles currently owned by this thread + std::unordered_map my_handles; + + std::weak_ptr weak_parent; + + // Called by the destructor. Releases this thread's handles back into the pool. + void release() { + if(my_handles.size() > 0) { + auto parent = weak_parent.lock(); + if (!parent) { + // If this thread exits after atexit handlers have completed, the + // cuda context itself may be invalid, so we must leak the handles. + return; + } + + std::lock_guard guard(parent->mutex); + for(auto d_h : my_handles) + parent->available_handles[d_h.first].push_back(d_h.second); + } + } + }; + + // Warning: + // If you want to change this function, be aware that this function will be called + // by multiple threads and there is no mutex guarding the call of this function, so + // make sure your implementation is thread-safe. + PoolWindow *newPoolWindow() { + // The returned pointer will be owned by a thread local variable + // so that different threads does not share the same PoolWindow. + return new PoolWindow(this->shared_from_this()); + } +}; + +}} // namespace at::cuda::detail:: diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh new file mode 100644 index 0000000000000000000000000000000000000000..db8519389e9ff567b11fd9a3bb31ac3d8646fe92 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/IndexUtils.cuh @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include + +namespace at::cuda::detail { + +TORCH_CUDA_CU_API bool maybeOverlappingIndices(const at::TensorBase &t); +using at::native::canUse32BitIndexMath; + +template +TensorInfo +getTensorInfo(const at::TensorBase &t) { + IndexType sz[MAX_TENSORINFO_DIMS]; + IndexType st[MAX_TENSORINFO_DIMS]; + + int dims = t.dim(); + for (int i = 0; i < dims; ++i) { + sz[i] = t.size(i); + st[i] = t.stride(i); + } + + scalar* data_ptr = nullptr; + + if constexpr (std::is_const::value) { + data_ptr = t.const_data_ptr(); + } else { + data_ptr = t.mutable_data_ptr(); + } + + return TensorInfo( + data_ptr, dims, sz, st); +} + +} // namespace at::cuda::detail diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e8a26b5e06a6ffeeeaf5df26f09175a2c903aa01 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/IntegerDivider.cuh @@ -0,0 +1,124 @@ +#pragma once + +#include +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) +#include +#endif + +namespace at::cuda::detail { + +// A utility class to implement integer division by multiplication, given a fixed +// divisor. +// +// WARNING: The fast divider algorithm is only implemented for unsigned int; +// otherwise we default to plain integer division. For unsigned int, +// we further assume that the dividend is at most INT32_MAX. Thus, +// IntDivider must NOT be used for general integer division. +// +// This reduced range is enough for our purpose, and it allows us to +// slightly simplify the computation. +// +// (NOTE: Below, "2^k" denotes exponentiation, i.e., 1< 0), we can find a "magic number" m (2^N +// <= m < 2^(N+1)) and shift s such that: +// +// \floor(n / d) = \floor((m * n) / 2^(N+s)). +// +// Given such m and s, the integer division can be then implemented as: +// +// let m' = m - 2^N // 0 <= m' < 2^N +// +// fast_integer_division(n): +// // Multiply two N-bit unsigned integers: the result is a 2N-bit unsigned +// // integer. Then take the higher N bits. +// t = (m' * n) >> N +// +// // Here we use the fact that n is less than 2^(N-1): otherwise the value +// // of (t + n) may not fit in an N-bit integer. +// return (t + n) >> s +// +// Finding such a magic number is surprisingly easy: +// +// s = \ceil(\log_2 d) +// m' = \floor(2^N * (2^s - d) / d) + 1 // Need 2N-bit integer arithmetic. +// +// See also: +// - Division by Invariant Integers Using Multiplication, +// Torbjörn Granlund and Peter L. Montgomery, 1994. +// +// - http://www.hackersdelight.org/magic.htm +// +// - http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html + +// Result of div/mod operation stored together. +template +struct DivMod { + Value div, mod; + + C10_HOST_DEVICE DivMod(Value div, Value mod) : div(div), mod(mod) { } +}; + +// Base case: we only have an implementation for uint32_t for now. For +// everything else, we use plain division. +template +struct IntDivider { + IntDivider() = default; + IntDivider(Value d) : divisor(d) { } + + C10_HOST_DEVICE inline Value div(Value n) const { return n / divisor; } + C10_HOST_DEVICE inline Value mod(Value n) const { return n % divisor; } + C10_HOST_DEVICE inline DivMod divmod(Value n) const { + return DivMod(n / divisor, n % divisor); + } + + Value divisor; +}; + +// Implement fast integer division. +template <> +struct IntDivider { + static_assert(sizeof(unsigned int) == 4, "Assumes 32-bit unsigned int."); + + IntDivider() = default; + + IntDivider(unsigned int d) : divisor(d) { + assert(divisor >= 1 && divisor <= INT32_MAX); + + // TODO: gcc/clang has __builtin_clz() but it's not portable. + for (shift = 0; shift < 32; shift++) if ((1U << shift) >= divisor) break; + + uint64_t one = 1; + uint64_t magic = ((one << 32) * ((one << shift) - divisor)) / divisor + 1; + m1 = magic; + assert(m1 > 0 && m1 == magic); // m1 must fit in 32 bits. + } + + C10_HOST_DEVICE inline unsigned int div(unsigned int n) const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + // 't' is the higher 32-bits of unsigned 32-bit multiplication of 'n' and + // 'm1'. + unsigned int t = __umulhi(n, m1); + return (t + n) >> shift; +#else + // Using uint64_t so that the addition does not overflow. + uint64_t t = ((uint64_t) n * m1) >> 32; + return (t + n) >> shift; +#endif + } + + C10_HOST_DEVICE inline unsigned int mod(unsigned int n) const { + return n - div(n) * divisor; + } + + C10_HOST_DEVICE inline DivMod divmod(unsigned int n) const { + unsigned int q = div(n); + return DivMod(q, n - q * divisor); + } + + unsigned int divisor; // d above. + unsigned int m1; // Magic number: m' above. + unsigned int shift; // Shift amounts. +}; + +} // namespace at::cuda::detail diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2959fe8e65b87e4113f02225079bf1ff7ef8cbc7 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/OffsetCalculator.cuh @@ -0,0 +1,119 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +// If element_sizes is nullptr, then the strides will be in bytes, otherwise +// the strides will be in # of elements. +// Operands that share the same shape, but may have different strides. +// OffsetCalculator iterates the tensor in a column-major order + +#if defined(USE_ROCM) +constexpr int MAX_DIMS = 16; +#else +constexpr int MAX_DIMS = 25; +#endif + +template +struct OffsetCalculator { + // We allow having negative strides to implement some operations like torch.flip + using stride_t = std::conditional_t, + index_t>; + // The offset for each argument. Wrapper around fixed-size array. + // On CUDA, zero sized array is not allowed, so when we are handling nullary + // operators, we need to create a size 1 offset to avoid compiler failure. + // This size 1 offset is just a placeholder, and we will not use it. + using offset_type = at::detail::Array(NARGS, 1)>; + + // if element_sizes is nullptr, then the strides will be in bytes, otherwise + // the strides will be in # of elements. + OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes=nullptr) : dims(dims) { + TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims"); + for (int i=0; i < dims; i++){ + sizes_[i] = at::cuda::detail::IntDivider(sizes[i]); + for (int arg = 0; arg < NARGS; arg++) { + int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]); + strides_[i][arg] = strides[arg][i] / element_size; + } + } + } + + C10_HOST_DEVICE offset_type get(index_t linear_idx) const { + offset_type offsets; + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = 0; + } + + #pragma unroll + for (int dim = 0; dim < MAX_DIMS; ++dim) { + if (dim == dims) { + break; + } + auto divmod = sizes_[dim].divmod(linear_idx); + linear_idx = divmod.div; + + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] += divmod.mod * strides_[dim][arg]; + } + + } + return offsets; + } + + int dims; + at::cuda::detail::IntDivider sizes_[MAX_DIMS]; + stride_t strides_[MAX_DIMS][std::max(NARGS, 1)]; +}; + +template +struct TrivialOffsetCalculator { + // The offset for each argument. Wrapper around fixed-size array. + // The offsets are in # of elements, not in bytes. + // On CUDA, zero sized array is not allowed, so when we are handling nullary + // operators, we need to create a size 1 offset to avoid compiler failure. + // This size 1 offset is just a placeholder, and we will not use it. + using offset_type = at::detail::Array(NARGS, 1)>; + + C10_HOST_DEVICE offset_type get(index_t linear_idx) const { + offset_type offsets; + #pragma unroll + for (int arg = 0; arg < NARGS; arg++) { + offsets[arg] = linear_idx; + } + return offsets; + } +}; + +// Make an OffsetCalculator with byte offsets +template +static OffsetCalculator make_offset_calculator(const at::TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(N <= iter.ntensors()); + std::array strides; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i).data(); + } + return OffsetCalculator(iter.ndim(), iter.shape().data(), strides.data()); +} + +// Make an OffsetCalculator with element offsets +template +static OffsetCalculator make_element_offset_calculator( + const at::TensorIteratorBase& iter) { + TORCH_INTERNAL_ASSERT(N <= iter.ntensors()); + std::array strides; + std::array element_sizes; + for (int i = 0; i < N; i++) { + strides[i] = iter.strides(i).data(); + element_sizes[i] = iter.element_size(i); + } + return OffsetCalculator( + iter.ndim(), iter.shape().data(), strides.data(), element_sizes.data()); +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c9eeeadd542d7702348276d7b538bd49846c5da8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/PhiloxCudaStateRaw.cuh @@ -0,0 +1,43 @@ +// No "#pragma once" because this is a raw definition that can be copied by jit codegen. +// Eager mode clients should not include this file directly, instead, +// they should #include , which has a #pragma once. + +// Stores RNG state values. Passed as a kernel argument. +// See Note [CUDA Graph-safe RNG states]. +// +// The raw definition lives in its own file so jit codegen can easily copy it. +namespace at { + +struct PhiloxCudaState { + PhiloxCudaState() = default; + // Called if graph capture is not underway + PhiloxCudaState(uint64_t seed, + uint64_t offset) { + seed_.val = seed; + offset_.val = offset; + } + // Called if graph capture is underway + PhiloxCudaState(int64_t* seed, + int64_t* offset_extragraph, + uint32_t offset_intragraph) { + seed_.ptr = seed; + offset_.ptr = offset_extragraph; + offset_intragraph_ = offset_intragraph; + captured_ = true; + } + + // Public members, directly accessible by at::cuda::philox::unpack. + // If we made them private with getters/setters, the getters/setters + // would have to be __device__, and we can't declare __device__ in ATen. + union Payload { + uint64_t val; + int64_t* ptr; + }; + + Payload seed_; + Payload offset_; + uint32_t offset_intragraph_ = 0; + bool captured_ = false; +}; + +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a320000ae881faa7416bae6ed1f37793f357a73a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/detail/TensorInfo.cuh @@ -0,0 +1,116 @@ +#pragma once + +#include + +namespace at::cuda::detail { + +#define MAX_TENSORINFO_DIMS 25 + +// CUDA kernel argument that defines tensor layout +template +struct TensorInfo { + TensorInfo(); + TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]); + + // Set the size of the given dimension to 1, as if it were a + // reduction dim (allows you to calculate offsets of the reduction + // slice) + void reduceDim(int dim); + + // See note on [collapse dims]. + int collapseDims(const int excludeDim = -1); + + // Contiguous tensors of more than one dimension are collapsed down + // to one tensor + __host__ __device__ inline bool isContiguous() const { + return (dims == 1 && strides[0] == 1); + } + + T* data; + IndexType sizes[MAX_TENSORINFO_DIMS]; + IndexType strides[MAX_TENSORINFO_DIMS]; + int dims; +}; + +template +TensorInfo::TensorInfo() { + data = nullptr; + dims = 0; +} + +template +TensorInfo::TensorInfo(T* p, + int dim, + IndexType sz[MAX_TENSORINFO_DIMS], + IndexType st[MAX_TENSORINFO_DIMS]) { + data = p; + dims = dim; + TORCH_CHECK(dims < MAX_TENSORINFO_DIMS, "CUDA Tensors cannot have more than 25 dimensions"); + + for (int i = 0; i < dim; ++i) { + sizes[i] = sz[i]; + strides[i] = st[i]; + } +} + +template +void +TensorInfo::reduceDim(int dim) { + TORCH_CHECK(dim < dims && dim >= 0, "expected dim between 0 and dims - 1"); + sizes[dim] = 1; +} + +template +int +TensorInfo::collapseDims(const int excludeDim) { + auto result = at::collapse_dims(sizes, strides, dims, excludeDim); + dims = std::get<1>(result); + return std::get<0>(result); +} + +// Translate a linear index for the apply to a T* offset; +// specialized on `Dims` to reduce nvcc compilation time +template +struct IndexToOffset { + static __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + // Uses static dims + for (int i = Dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +// Uses dynamic (runtime) instead of static (compiletime) dims +template +struct IndexToOffset { + static inline __host__ __device__ IndexType get( + IndexType linearId, + const TensorInfo& info) { + + IndexType offset = 0; + + for (int i = info.dims - 1; i > 0; --i) { + IndexType curDimIndex = linearId % info.sizes[i]; + IndexType curDimOffset = curDimIndex * info.strides[i]; + offset += curDimOffset; + linearId /= info.sizes[i]; + } + + return offset + linearId * info.strides[0]; + } +}; + +} // namespace at::cuda::detail diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/jiterator.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/jiterator.h new file mode 100644 index 0000000000000000000000000000000000000000..a7a440cd7b614732d5ad16eb28c56e71bb20b6e1 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/jiterator.h @@ -0,0 +1,40 @@ +#pragma once +#include + +#if AT_USE_JITERATOR() + +#include +#include +#include + +#include +#include + +namespace at::cuda { + +TORCH_CUDA_CPP_API c10::SmallVector CompileAndLaunchKernel( + const std::string& code_string, + const std::string& kernel_name, + const int num_outputs, + const c10::SmallVector& tensors, + const c10::SmallVector& extra_args, + bool return_by_ref); + +} // namespace at::cuda + +#else + +namespace at::cuda { + +TORCH_CUDA_CPP_API c10::SmallVector CompileAndLaunchKernel( + const std::string& code_string, + const std::string& kernel_name, + const int num_outputs, + const c10::SmallVector& tensors, + const c10::SmallVector& extra_args, + bool return_by_ref) { + TORCH_CHECK(false, "Jiterator is not supported"); + } +} // namespace at::cuda + +#endif // AT_USE_JITERATOR() diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h new file mode 100644 index 0000000000000000000000000000000000000000..ab3ed56796b2f7d7b70fe5f116badaf693bf42d9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmCommon.h @@ -0,0 +1,174 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include + +#include +#include +#include + +namespace at::cuda::tunable { + +enum class BlasOp { + N = 0, + T = 1 +}; + +inline std::string BlasOpToString(BlasOp op) { + switch (op) { + case BlasOp::N: + return "N"; + case BlasOp::T: + return "T"; + } + TORCH_CHECK(false, "unrecognized BlasOp"); + return "N"; +} + +template +struct GemmParams : OpParams { + std::string Signature() const override { + return c10::str(transa, transb, "_", m, "_", n, "_", k); + } + + GemmParams* DeepCopy() const { + GemmParams* copy = new GemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = m * n * sizeof(T); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + } + + TuningStatus NumericalCheck(GemmParams *other) { + auto options = at::TensorOptions().dtype(c10::CppTypeToScalarType::value).device(at::kCUDA); + // comparison done as 1D tensor + at::Tensor ref = at::from_blob(c, {m*n}, options); + at::Tensor oth = at::from_blob(other->c, {m*n}, options); + at::Tensor ref_float = ref.to(at::kFloat); + at::Tensor oth_float = oth.to(at::kFloat); + std::vector atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + std::vector rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + double last_succeed_atol = 1; + double last_succeed_rtol = 1; + for (auto& atol : atols) { + for (auto& rtol : rtols) { + if (at::allclose(ref_float, oth_float, rtol, atol)) { + last_succeed_atol = atol; + last_succeed_rtol = rtol; + } + } + } + if (last_succeed_atol == 1) { + return FAIL; + } + else { + TUNABLE_LOG("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol); + } + + return OK; + } + + char transa; + char transb; + int64_t m; + int64_t n; + int64_t k; + at::opmath_type alpha; + const T* a; + int64_t lda; + const T* b; + int64_t ldb; + at::opmath_type beta; + T* c; + int64_t ldc; +}; + +template +struct GemmStridedBatchedParams : OpParams { + std::string Signature() const override { + return c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch); + } + + GemmStridedBatchedParams* DeepCopy() const { + GemmStridedBatchedParams* copy = new GemmStridedBatchedParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = batch * stride_c * sizeof(T); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + } + + TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { + auto options = at::TensorOptions().dtype(c10::CppTypeToScalarType::value).device(at::kCUDA); + // comparison done as 1D tensor + at::Tensor ref = at::from_blob(c, {batch*stride_c}, options); + at::Tensor oth = at::from_blob(other->c, {batch*stride_c}, options); + at::Tensor ref_float = ref.to(at::kFloat); + at::Tensor oth_float = oth.to(at::kFloat); + std::vector atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + std::vector rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + double last_succeed_atol = 1; + double last_succeed_rtol = 1; + for (auto& atol : atols) { + for (auto& rtol : rtols) { + if (at::allclose(ref_float, oth_float, rtol, atol)) { + last_succeed_atol = atol; + last_succeed_rtol = rtol; + } + } + } + if (last_succeed_atol == 1) { + return FAIL; + } + else { + TUNABLE_LOG("├──verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol); + } + + return OK; + } + + char transa; + char transb; + int64_t m; + int64_t n; + int64_t k; + at::opmath_type alpha; + const T* a; + int64_t lda; + int64_t stride_a; + const T* b; + int64_t ldb; + int64_t stride_b; + at::opmath_type beta; + T* c; + int64_t ldc; + int64_t stride_c; + int64_t batch; +}; + +} // namespace at::cuda::tunable diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h new file mode 100644 index 0000000000000000000000000000000000000000..fbed75d513b5dcba079b381d754bc7614cb30ac0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/GemmHipblaslt.h @@ -0,0 +1,379 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +#define TORCH_HIPBLASLT_CHECK(EXPR) \ + do { \ + hipblasStatus_t __err = EXPR; \ + TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS, \ + "hipblaslt error: ", \ + hipblasStatusToString(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +#ifdef HIPBLASLT_HAS_GETINDEXFROMALGO +#define GETINDEXFROMALGO(algo) hipblaslt_ext::getIndexFromAlgo(algo) +#else +static int getIndexFromAlgo(hipblasLtMatmulAlgo_t& algo) { + int* algo_ptr = (int*)algo.data; + if(*algo_ptr < 0) { + return -1; + } + return *algo_ptr; +} +#define GETINDEXFROMALGO(algo) getIndexFromAlgo(algo) +#endif + +#ifdef HIPBLASLT_CUSTOM_COMPUTE_TYPE +#define COMPUTE_TYPE_32 HIPBLASLT_COMPUTE_F32 +#else +#define COMPUTE_TYPE_32 HIPBLAS_COMPUTE_32F +#endif + +#ifdef HIPBLASLT_CUSTOM_DATA_TYPE + +template +constexpr hipblasltDatatype_t HipBlasDataTypeFor(); + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_32F; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_16F; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_16B; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_64F; +} + +#define DATA_TYPE_R_32 HIPBLASLT_R_32F + +#else + +template +constexpr hipblasDatatype_t HipBlasDataTypeFor(); + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_32F; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_16F; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_16B; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_64F; +} + +#ifdef HIPBLAS_V2 +#define DATA_TYPE_R_32 HIP_R_32F +#else +#define DATA_TYPE_R_32 HIPBLAS_R_32F +#endif + +#endif + +template +int GetBatchFromParams(const ParamsT* params) { + return 1; +} + +template +int GetBatchFromParams(const GemmStridedBatchedParams* params) { + return params->batch; +} + +template +int GetStrideAFromParams(const ParamsT* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmStridedBatchedParams* params) { + return params->stride_a; +} + +template +int GetStrideBFromParams(const ParamsT* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmStridedBatchedParams* params) { + return params->stride_b; +} + +template +int GetStrideCFromParams(const ParamsT* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmStridedBatchedParams* params) { + return params->stride_c; +} + +static hipblasOperation_t _hipblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return HIPBLAS_OP_N; + case 't': + case 'T': + return HIPBLAS_OP_T; + case 'c': + case 'C': + return HIPBLAS_OP_C; + } + AT_ERROR( + "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +static char _charFromhipblasOp(hipblasOperation_t op) { + switch (op) { + case HIPBLAS_OP_N: + return 'N'; + case HIPBLAS_OP_T: + return 'T'; + case HIPBLAS_OP_C: + return 'C'; + } + AT_ERROR( + "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`"); +} + +static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) { + if (layout == BlasOp::N) { + return HIPBLAS_OP_N; + } + return HIPBLAS_OP_T; +} + +static size_t GetHipblasltWorkspaceSize() { + static const char * env = getenv("HIPBLASLT_WORKSPACE_SIZE"); + // 256MB is max workspace size allowed for hipblaslt + // hipblaslt-bench uses 32MB + // recommendation from hipblaslt author was 76MB + size_t workspace_size = 2*128*1024*1024; // default 256MB + if (env) { + try { + workspace_size = std::stoi(env); + } catch(std::invalid_argument const& e) { + TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,", + " using default workspace size of ", workspace_size, " bytes."); + } catch(std::out_of_range const& e) { + TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,", + " using default workspace size of ", workspace_size, " bytes."); + } + } + return workspace_size; +} + +template +class HipblasltGemmOp : public Callable { + public: + HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {} + + TuningStatus Call(const ParamsT* params) override { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto in_out_datatype = HipBlasDataTypeFor(); + auto opa = _hipblasOpFromChar(params->transa); + auto opb = _hipblasOpFromChar(params->transb); + + TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen"); + + float alpha = static_cast(params->alpha); + float beta = static_cast(params->beta); + + hipblasLtMatrixLayout_t mat_a, mat_b, mat_c; + hipblasLtMatmulDesc_t matmul; + if (opa == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, in_out_datatype, params->m, params->k, params->lda)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, in_out_datatype, params->k, params->m, params->lda)); + } + if (opb == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, in_out_datatype, params->k, params->n, params->ldb)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, in_out_datatype, params->n, params->k, params->ldb)); + } + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescCreate(&matmul, COMPUTE_TYPE_32, DATA_TYPE_R_32)); + + int batch = GetBatchFromParams(params); + if (batch > 1) { + int64_t stride_a = GetStrideAFromParams(params); + int64_t stride_b = GetStrideBFromParams(params); + int64_t stride_c = GetStrideCFromParams(params); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c))); + } + + TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSA, &opa, sizeof(int32_t))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescSetAttribute( + matmul, HIPBLASLT_MATMUL_DESC_TRANSB, &opb, sizeof(int32_t))); + + size_t workspace_size = GetHipblasltWorkspaceSize(); + + auto op_handle = at::cuda::getCurrentCUDABlasLtHandle(); + + size_t ret_workspace_size = 0; + auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle, + matmul, + &alpha, + mat_a, + mat_b, + &beta, + mat_c, + mat_c, + algo_, + ret_workspace_size); + + if (status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size >= workspace_size) { + //TUNABLE_LOG("[hipBLASLt] Solution #", algo_index, " workspace too large"); + return FAIL; + } + } + else { + //TUNABLE_LOG("[hipBLASLt] Solution #", algo_index, " not supported"); + return FAIL; + } + + void* workspace_buffer = nullptr; + if (workspace_size > 0) { + workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size); + } + + TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle, + matmul, + &alpha, + params->a, + mat_a, + params->b, + mat_b, + &beta, + params->c, + mat_c, + params->c, + mat_c, + &algo_, + workspace_buffer, + workspace_size, + at::cuda::getCurrentCUDAStream())); + + TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescDestroy(matmul)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c)); + if (workspace_size > 0) { + c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer); + } + return OK; + } + + private: + hipblasLtMatmulAlgo_t algo_; +}; + +template +auto GetHipBlasLtTypeStringAndOps() { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto in_out_datatype = HipBlasDataTypeFor(); + std::vector heuristic_result; + + hipblasLtHandle_t handle; + TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle)); + TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle, + hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + transa_outer, + transb_outer, + in_out_datatype, + in_out_datatype, + in_out_datatype, + in_out_datatype, + COMPUTE_TYPE_32, + heuristic_result)); + TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle)); + + // Sort heuristic_result by algo index to make sure the order of returned algos is deterministic. + std::sort(heuristic_result.begin(), + heuristic_result.end(), + [](hipblasLtMatmulHeuristicResult_t& a, hipblasLtMatmulHeuristicResult_t& b) { + return GETINDEXFROMALGO(a.algo) < GETINDEXFROMALGO(b.algo); + }); + + int returned_algo_count = heuristic_result.size(); + std::vector>>> ret; + for (int i = 0; i < returned_algo_count; i++) { + auto algo = heuristic_result[i].algo; + int algo_index = GETINDEXFROMALGO(algo); + auto callable = std::make_unique>(algo); + std::string type_string = c10::str( + "Gemm_Hipblaslt_", _charFromhipblasOp(transa_outer), _charFromhipblasOp(transb_outer), "_", algo_index); + ret.emplace_back(type_string, std::move(callable)); + } + + return ret; +} + +template +auto GetHipBlasLtGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +#undef TORCH_HIPBLASLT_CHECK +#undef GETINDEXFROMALGO +#undef COMPUTE_TYPE_32 +#undef DATA_TYPE_R_32 + +} // namespace at::cuda::tunable diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h new file mode 100644 index 0000000000000000000000000000000000000000..69889cbbcbfc6f0b03da266f49e92723a38094ea --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/cuda/tunable/StreamTimer.h @@ -0,0 +1,34 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include + +#include + +namespace at::cuda::tunable { + +class StreamTimer : public ITimer { + public: + StreamTimer(); + virtual ~StreamTimer(); + + void Start() override; + + void End() override; + + float Duration() override; + + private: + cudaEvent_t start_; + cudaEvent_t end_; +}; + +} // namespace at::cuda::tunable diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorConversions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorConversions.h new file mode 100644 index 0000000000000000000000000000000000000000..fa0d58f3c12996b01030ddde6d54a25be0f86c6c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/TensorConversions.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at { + class Tensor; +namespace native { +bool to_will_alias( + const Tensor& self, + c10::optional dtype, + c10::optional layout, + c10::optional device, + bool copy, + c10::optional optional_memory_format); + +Tensor to_meta(const Tensor& tensor); +c10::optional to_meta(const c10::optional& tensor); +std::vector to_meta(at::ITensorListRef t_list); +Tensor dense_to_sparse_with_mask(const Tensor& self, const Tensor& mask, c10::optional layout, OptionalIntArrayRef blocksize, c10::optional dense_dim_opt); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/group_norm.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/group_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..1673df9253eec782328db0f673e6526b698641d0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/native/group_norm.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using forward_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + int64_t /* N */, + int64_t /* C */, + int64_t /* HxW */, + int64_t /* group */, + double /* eps */, + Tensor& /* Y */, + Tensor& /* mean */, + Tensor& /* rstd */); + +using backward_fn = void (*)( + const Tensor& /* dY */, + const Tensor& /* X */, + const Tensor& /* mean */, + const Tensor& /* rstd */, + const Tensor& /* gamma */, + int64_t /* N */, + int64_t /* C */, + int64_t /* HxW */, + int64_t /* group */, + Tensor& /* dX */, + Tensor& /* dgamma */, + Tensor& /* dbeta */); + +DECLARE_DISPATCH(forward_fn, GroupNormKernel); +DECLARE_DISPATCH(backward_fn, GroupNormBackwardKernel); + +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cdist_forward_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cdist_forward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3c3be9e3b3451a119ea22c8b8000b9116241a82d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_cdist_forward_native.h @@ -0,0 +1,22 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _cdist_forward_out(const at::Tensor & x1, const at::Tensor & x2, double p, c10::optional compute_mode, at::Tensor & out); +TORCH_API at::Tensor _cdist_forward(const at::Tensor & x1, const at::Tensor & x2, double p, c10::optional compute_mode); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..0859f22c1bd775beac70f4d27609be489e6a20ee --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_clamp_min_cpu_dispatch.h @@ -0,0 +1,28 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, const at::Scalar & scalar); +TORCH_API void _foreach_clamp_min_(at::TensorList self, const at::Scalar & scalar); +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, at::TensorList other); +TORCH_API void _foreach_clamp_min_(at::TensorList self, at::TensorList other); +TORCH_API ::std::vector _foreach_clamp_min(at::TensorList self, at::ArrayRef scalars); +TORCH_API void _foreach_clamp_min_(at::TensorList self, at::ArrayRef scalars); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_log_softmax_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_log_softmax_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..c6c5c2e24e3dd11aba12b2647e91bf9aa6a99a5d --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_log_softmax_meta_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor _log_softmax(const at::Tensor & self, int64_t dim, bool half_to_float); +TORCH_API at::Tensor & _log_softmax_out(at::Tensor & out, const at::Tensor & self, int64_t dim, bool half_to_float); +TORCH_API at::Tensor & _log_softmax_outf(const at::Tensor & self, int64_t dim, bool half_to_float, at::Tensor & out); + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..f9fd0f9ba350eef414b477217538093f52167c82 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_make_per_tensor_quantized_tensor_cpu_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _make_per_tensor_quantized_tensor(const at::Tensor & self, double scale, int64_t zero_point); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_backward_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_backward_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..740ae0590d88a02cd52d34daeda20ff664b7c479 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_backward_compositeexplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & _masked_softmax_backward_out(at::Tensor & out, const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, c10::optional dim=c10::nullopt); +TORCH_API at::Tensor & _masked_softmax_backward_outf(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, c10::optional dim, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3a2dc2314b0a528a382792c6d327abf41a4c1ab5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_backward_native.h @@ -0,0 +1,23 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & _masked_softmax_backward_out(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, c10::optional dim, at::Tensor & out); +TORCH_API at::Tensor masked_softmax_backward_cpu(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, c10::optional dim=c10::nullopt); +TORCH_API at::Tensor masked_softmax_backward_cuda(const at::Tensor & grad_output, const at::Tensor & output, const at::Tensor & mask, c10::optional dim=c10::nullopt); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..41a7ab61a19e3f7ca833b8cc82f9a29a7b1d7d8c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_masked_softmax_cpu_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor _masked_softmax(const at::Tensor & self, const at::Tensor & mask, c10::optional dim=c10::nullopt, c10::optional mask_type=c10::nullopt); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nnpack_spatial_convolution_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nnpack_spatial_convolution_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..2f0c89167fe0c3cd02a213ae544a4a9b749e6290 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_nnpack_spatial_convolution_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _nnpack_spatial_convolution { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_nnpack_spatial_convolution") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_nnpack_spatial_convolution(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1) -> Tensor") + static at::Tensor call(const at::Tensor & input, const at::Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride); +}; + +struct TORCH_API _nnpack_spatial_convolution_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const c10::optional &, c10::SymIntArrayRef, c10::SymIntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_nnpack_spatial_convolution") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_nnpack_spatial_convolution.out(Tensor input, Tensor weight, Tensor? bias, SymInt[2] padding, SymInt[2] stride=1, *, Tensor(a!) out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & input, const at::Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & input, const at::Tensor & weight, const c10::optional & bias, c10::SymIntArrayRef padding, c10::SymIntArrayRef stride, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_backward_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..0640760043364663dffbb85e28d66fd483938be3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_scaled_dot_product_efficient_attention_backward_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _scaled_dot_product_efficient_attention_backward { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, double, ::std::array, bool, c10::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_scaled_dot_product_efficient_attention_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_scaled_dot_product_efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor attn_bias, Tensor out, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, float dropout_p, bool[4] grad_input_mask, bool is_causal=False, *, float? scale=None) -> (Tensor, Tensor, Tensor, Tensor)") + static ::std::tuple call(const at::Tensor & grad_out_, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & attn_bias, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & philox_seed, const at::Tensor & philox_offset, double dropout_p, ::std::array grad_input_mask, bool is_causal, c10::optional scale); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad_out_, const at::Tensor & query, const at::Tensor & key, const at::Tensor & value, const at::Tensor & attn_bias, const at::Tensor & out, const at::Tensor & logsumexp, const at::Tensor & philox_seed, const at::Tensor & philox_offset, double dropout_p, ::std::array grad_input_mask, bool is_causal, c10::optional scale); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..eb6dc75b3cb97cf591f4cc7672de6cb56eb8a14f --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_serialization_subcmul_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _test_serialization_subcmul { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const at::Scalar &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_serialization_subcmul") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor") + static at::Tensor call(const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & other, const at::Scalar & alpha); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..46fae6926f08ac0bdbf4a70d8355b28a22db029e --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/adaptive_avg_pool3d_backward.h @@ -0,0 +1,34 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self) { + return at::_ops::adaptive_avg_pool3d_backward_grad_input::call(grad_output, self, grad_input); +} +// aten::adaptive_avg_pool3d_backward.grad_input(Tensor grad_output, Tensor self, *, Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & adaptive_avg_pool3d_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, at::Tensor & grad_input) { + return at::_ops::adaptive_avg_pool3d_backward_grad_input::call(grad_output, self, grad_input); +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_cpu_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..946bcb2f7c7d14d53d59e7c4b69190f41eb46cdf --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/avg_pool2d_cpu_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor avg_pool2d(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, c10::optional divisor_override=c10::nullopt); +TORCH_API at::Tensor & avg_pool2d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride={}, at::IntArrayRef padding=0, bool ceil_mode=false, bool count_include_pad=true, c10::optional divisor_override=c10::nullopt); +TORCH_API at::Tensor & avg_pool2d_outf(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef stride, at::IntArrayRef padding, bool ceil_mode, bool count_include_pad, c10::optional divisor_override, at::Tensor & out); + +} // namespace cpu +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binomial_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binomial_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..1e7987619f6a55cb2f1714a5ca08d0735d35e815 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/binomial_compositeexplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & binomial_out(at::Tensor & out, const at::Tensor & count, const at::Tensor & prob, c10::optional generator=c10::nullopt); +TORCH_API at::Tensor & binomial_outf(const at::Tensor & count, const at::Tensor & prob, c10::optional generator, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..92420e3c34efc509ea881207be885c5d4a80d65a --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fbgemm_linear_int8_weight_compositeimplicitautograd_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor fbgemm_linear_int8_weight(const at::Tensor & input, const at::Tensor & weight, const at::Tensor & packed, const at::Tensor & col_offsets, const at::Scalar & weight_scale, const at::Scalar & weight_zero_point, const at::Tensor & bias); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftfreq_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftfreq_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f03df4121236d2c2cf9e3f339803c422cf3149d5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fft_fftfreq_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API fft_fftfreq { + using schema = at::Tensor (int64_t, double, c10::optional, c10::optional, c10::optional, c10::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_fftfreq") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_fftfreq(int n, float d=1.0, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor") + static at::Tensor call(int64_t n, double d, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, int64_t n, double d, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory); +}; + +struct TORCH_API fft_fftfreq_out { + using schema = at::Tensor & (int64_t, double, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::fft_fftfreq") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "fft_fftfreq.out(int n, float d=1.0, *, Tensor(a!) out) -> Tensor(a!)") + static at::Tensor & call(int64_t n, double d, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, int64_t n, double d, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frobenius_norm_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frobenius_norm_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3eebb53610f50830348c22f242ce0cd8b39db0b8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frobenius_norm_native.h @@ -0,0 +1,22 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor frobenius_norm(const at::Tensor & self, at::IntArrayRef dim, bool keepdim=false); +TORCH_API at::Tensor & frobenius_norm_out(const at::Tensor & self, at::IntArrayRef dim, bool keepdim, at::Tensor & out); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/histogram_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/histogram_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..d590f4f9d779741173dcd79a18fc8d2e4d07e5a5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/histogram_ops.h @@ -0,0 +1,61 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API histogram_bins_tensor_out { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const c10::optional &, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::histogram") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "bins_tensor_out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "histogram.bins_tensor_out(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)") + static ::std::tuple call(const at::Tensor & self, const at::Tensor & bins, const c10::optional & weight, bool density, at::Tensor & hist, at::Tensor & bin_edges); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & bins, const c10::optional & weight, bool density, at::Tensor & hist, at::Tensor & bin_edges); +}; + +struct TORCH_API histogram_bins_tensor { + using schema = ::std::tuple (const at::Tensor &, const at::Tensor &, const c10::optional &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::histogram") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "bins_tensor") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "histogram.bins_tensor(Tensor self, Tensor bins, *, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)") + static ::std::tuple call(const at::Tensor & self, const at::Tensor & bins, const c10::optional & weight, bool density); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & bins, const c10::optional & weight, bool density); +}; + +struct TORCH_API histogram_bin_ct_out { + using schema = ::std::tuple (const at::Tensor &, int64_t, c10::optional>, const c10::optional &, bool, at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::histogram") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "bin_ct_out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "histogram.bin_ct_out(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False, Tensor(a!) hist, Tensor(b!) bin_edges) -> (Tensor(a!) hist, Tensor(b!) bin_edges)") + static ::std::tuple call(const at::Tensor & self, int64_t bins, c10::optional> range, const c10::optional & weight, bool density, at::Tensor & hist, at::Tensor & bin_edges); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t bins, c10::optional> range, const c10::optional & weight, bool density, at::Tensor & hist, at::Tensor & bin_edges); +}; + +struct TORCH_API histogram_bin_ct { + using schema = ::std::tuple (const at::Tensor &, int64_t, c10::optional>, const c10::optional &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::histogram") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "bin_ct") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "histogram.bin_ct(Tensor self, int bins=100, *, float[]? range=None, Tensor? weight=None, bool density=False) -> (Tensor hist, Tensor bin_edges)") + static ::std::tuple call(const at::Tensor & self, int64_t bins, c10::optional> range, const c10::optional & weight, bool density); + static ::std::tuple redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t bins, c10::optional> range, const c10::optional & weight, bool density); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hypot_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hypot_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..a090ebcd807b51431038fbc01ec2ec46986aaafa --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/hypot_cuda_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor hypot(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & hypot_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & hypot_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & hypot_(at::Tensor & self, const at::Tensor & other); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/im2col_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/im2col_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..43ab5e789844440fd79ba3a29d0e68474f9ba3b0 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/im2col_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API im2col_out { + using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::im2col") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "im2col.out(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, *, Tensor(a!) out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride, at::Tensor & out); +}; + +struct TORCH_API im2col { + using schema = at::Tensor (const at::Tensor &, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::im2col") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "im2col(Tensor self, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride) -> Tensor") + static at::Tensor call(const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef kernel_size, at::IntArrayRef dilation, at::IntArrayRef padding, at::IntArrayRef stride); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_select_backward.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_select_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..b74455fdace9b17d4483a78e1b12213d1340d197 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/index_select_backward.h @@ -0,0 +1,47 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::index_select_backward(Tensor grad, SymInt[] self_sizes, int dim, Tensor index) -> Tensor +inline at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index) { + return at::_ops::index_select_backward::call(grad, c10::fromIntArrayRefSlow(self_sizes), dim, index); +} +namespace symint { + template ::value>> + at::Tensor index_select_backward(const at::Tensor & grad, at::IntArrayRef self_sizes, int64_t dim, const at::Tensor & index) { + return at::_ops::index_select_backward::call(grad, c10::fromIntArrayRefSlow(self_sizes), dim, index); + } +} + +// aten::index_select_backward(Tensor grad, SymInt[] self_sizes, int dim, Tensor index) -> Tensor +inline at::Tensor index_select_backward_symint(const at::Tensor & grad, c10::SymIntArrayRef self_sizes, int64_t dim, const at::Tensor & index) { + return at::_ops::index_select_backward::call(grad, self_sizes, dim, index); +} +namespace symint { + template ::value>> + at::Tensor index_select_backward(const at::Tensor & grad, c10::SymIntArrayRef self_sizes, int64_t dim, const at::Tensor & index) { + return at::_ops::index_select_backward::call(grad, self_sizes, dim, index); + } +} + +} diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/kl_div_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/kl_div_native.h new file mode 100644 index 0000000000000000000000000000000000000000..3e7152e3146bdfa4350304d5acf36e1fc50b66da --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/kl_div_native.h @@ -0,0 +1,21 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor kl_div(const at::Tensor & self, const at::Tensor & target, int64_t reduction=at::Reduction::Mean, bool log_target=false); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/kl_div_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/kl_div_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..fe0df1d7a577fdf6ccb079f55c35b7f4fc34373c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/kl_div_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API kl_div { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, int64_t, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::kl_div") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "kl_div(Tensor self, Tensor target, int reduction=Mean, *, bool log_target=False) -> Tensor") + static at::Tensor call(const at::Tensor & self, const at::Tensor & target, int64_t reduction, bool log_target); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & target, int64_t reduction, bool log_target); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..35164c499520db75db623591e1af6cf3027b56f5 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/linalg_lu_factor_ex_meta_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API ::std::tuple linalg_lu_factor_ex(const at::Tensor & A, bool pivot=true, bool check_errors=false); +TORCH_API ::std::tuple linalg_lu_factor_ex_out(at::Tensor & LU, at::Tensor & pivots, at::Tensor & info, const at::Tensor & A, bool pivot=true, bool check_errors=false); +TORCH_API ::std::tuple linalg_lu_factor_ex_outf(const at::Tensor & A, bool pivot, bool check_errors, at::Tensor & LU, at::Tensor & pivots, at::Tensor & info); + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nansum_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nansum_native.h new file mode 100644 index 0000000000000000000000000000000000000000..4f97d27f3c667d32332fba668737cf224b205ba3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nansum_native.h @@ -0,0 +1,22 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor nansum(const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, bool keepdim=false, c10::optional dtype=c10::nullopt); +TORCH_API at::Tensor & nansum_out(const at::Tensor & self, at::OptionalIntArrayRef dim, bool keepdim, c10::optional dtype, at::Tensor & out); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_backward_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..40af820f1cab913de99c93fccb62bfdec0097eb9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_backward_cuda_dispatch.h @@ -0,0 +1,28 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor nll_loss_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight); +TORCH_API at::Tensor nll_loss_backward_symint(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight); +TORCH_API at::Tensor & nll_loss_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight); +TORCH_API at::Tensor & nll_loss_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, int64_t ignore_index, const at::Tensor & total_weight, at::Tensor & grad_input); +TORCH_API at::Tensor & nll_loss_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight); +TORCH_API at::Tensor & nll_loss_backward_symint_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Tensor & target, const c10::optional & weight, int64_t reduction, c10::SymInt ignore_index, const at::Tensor & total_weight, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_compositeexplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..302a701b93cf248570a8458f97bc5b62048c1ed9 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/prod_compositeexplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & prod_out(at::Tensor & out, const at::Tensor & self, c10::optional dtype=c10::nullopt); +TORCH_API at::Tensor & prod_outf(const at::Tensor & self, c10::optional dtype, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/q_scale_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/q_scale_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..31a6179d6b776a7ef1dafcd4db0e3ba3a5232d89 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/q_scale_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API q_scale { + using schema = double (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::q_scale") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "q_scale(Tensor self) -> float") + static double call(const at::Tensor & self); + static double redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_rnn_tanh_cell_compositeimplicitautograd_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_rnn_tanh_cell_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..ff03265ee98eb7ef9b1c48b26a5a4cf758fd6fbe --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/quantized_rnn_tanh_cell_compositeimplicitautograd_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor quantized_rnn_tanh_cell(const at::Tensor & input, const at::Tensor & hx, const at::Tensor & w_ih, const at::Tensor & w_hh, const at::Tensor & b_ih, const at::Tensor & b_hh, const at::Tensor & packed_ih, const at::Tensor & packed_hh, const at::Tensor & col_offsets_ih, const at::Tensor & col_offsets_hh, const at::Scalar & scale_ih, const at::Scalar & scale_hh, const at::Scalar & zero_point_ih, const at::Scalar & zero_point_hh); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_backward_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_backward_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..03ff582b7db9e530c6867700bd2aac7d741ca4f3 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/softplus_backward_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor softplus_backward(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +TORCH_API at::Tensor & softplus_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold); +TORCH_API at::Tensor & softplus_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, const at::Scalar & beta, const at::Scalar & threshold, at::Tensor & grad_input); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_t_cuda_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_t_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..15e8582a6f13d701aeb1674dbffb775a81d0090b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_chebyshev_polynomial_t_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_chebyshev_polynomial_t(const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_t_out(at::Tensor & out, const at::Tensor & x, const at::Tensor & n); +TORCH_API at::Tensor & special_chebyshev_polynomial_t_outf(const at::Tensor & x, const at::Tensor & n, at::Tensor & out); + +} // namespace cuda +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tan_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tan_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..4ab10b6feef4def8c7125c3430f05ded3b563144 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tan_meta_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor tan(const at::Tensor & self); +TORCH_API at::Tensor & tan_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & tan_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & tan_(at::Tensor & self); + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_mkldnn_ops.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_mkldnn_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..17a1e32b0aef00d8bb1ed2940da5ce8c7ca4f37b --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_mkldnn_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API to_mkldnn { + using schema = at::Tensor (const at::Tensor &, c10::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::to_mkldnn") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "to_mkldnn(Tensor self, ScalarType? dtype=None) -> Tensor") + static at::Tensor call(const at::Tensor & self, c10::optional dtype); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional dtype); +}; + +struct TORCH_API to_mkldnn_out { + using schema = at::Tensor & (const at::Tensor &, c10::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::to_mkldnn") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "to_mkldnn.out(Tensor self, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, c10::optional dtype, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, c10::optional dtype, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_sparse_bsc_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_sparse_bsc_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b9e5ae864c27e3e2fc078fe1ea8c48c25cc50739 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/to_sparse_bsc_native.h @@ -0,0 +1,21 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor to_sparse_bsc(const at::Tensor & self, at::IntArrayRef blocksize, c10::optional dense_dim=c10::nullopt); +} // namespace native +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_backward_meta_dispatch.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_backward_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7483fe46a0dd1ea45d5f11a63758a09a40a4334c --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_trilinear3d_backward_meta_dispatch.h @@ -0,0 +1,28 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor upsample_trilinear3d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional scales_d=c10::nullopt, c10::optional scales_h=c10::nullopt, c10::optional scales_w=c10::nullopt); +TORCH_API at::Tensor upsample_trilinear3d_backward_symint(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional scales_d=c10::nullopt, c10::optional scales_h=c10::nullopt, c10::optional scales_w=c10::nullopt); +TORCH_API at::Tensor & upsample_trilinear3d_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional scales_d=c10::nullopt, c10::optional scales_h=c10::nullopt, c10::optional scales_w=c10::nullopt); +TORCH_API at::Tensor & upsample_trilinear3d_backward_outf(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w, at::Tensor & grad_input); +TORCH_API at::Tensor & upsample_trilinear3d_backward_symint_out(at::Tensor & grad_input, const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional scales_d=c10::nullopt, c10::optional scales_h=c10::nullopt, c10::optional scales_w=c10::nullopt); +TORCH_API at::Tensor & upsample_trilinear3d_backward_symint_outf(const at::Tensor & grad_output, c10::SymIntArrayRef output_size, c10::SymIntArrayRef input_size, bool align_corners, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w, at::Tensor & grad_input); + +} // namespace meta +} // namespace at diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/var_mean_native.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/var_mean_native.h new file mode 100644 index 0000000000000000000000000000000000000000..120fa1e4ff7e8f8dceb95d34fc943240994481f8 --- /dev/null +++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/var_mean_native.h @@ -0,0 +1,26 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::tuple var_mean(const at::Tensor & self, bool unbiased=true); +TORCH_API ::std::tuple var_mean(const at::Tensor & self, at::OptionalIntArrayRef dim, bool unbiased=true, bool keepdim=false); +TORCH_API ::std::tuple var_mean_correction_out(const at::Tensor & self, at::OptionalIntArrayRef dim, const c10::optional & correction, bool keepdim, at::Tensor & out0, at::Tensor & out1); +TORCH_API ::std::tuple var_mean(const at::Tensor & self, at::OptionalIntArrayRef dim=c10::nullopt, const c10::optional & correction=c10::nullopt, bool keepdim=false); +TORCH_API ::std::tuple var_mean(const at::Tensor & self, at::DimnameList dim, bool unbiased=true, bool keepdim=false); +TORCH_API ::std::tuple var_mean(const at::Tensor & self, at::DimnameList dim, const c10::optional & correction=c10::nullopt, bool keepdim=false); +} // namespace native +} // namespace at