diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/EmptyTensor.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/EmptyTensor.h new file mode 100644 index 0000000000000000000000000000000000000000..39b206cb3031d7fbdcc704c86453791adf81e15e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/EmptyTensor.h @@ -0,0 +1,29 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include + +namespace at::detail { + +C10_EXPORT TensorBase empty_mps( + IntArrayRef size, + std::optional dtype_opt, + std::optional layout_opt, + std::optional device_opt, + std::optional pin_memory_opt, + std::optional memory_format_opt); +C10_EXPORT TensorBase empty_mps( + IntArrayRef size, const TensorOptions &options); + +C10_EXPORT TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + ScalarType dtype, + std::optional device_opt); + +C10_EXPORT TensorBase empty_strided_mps( + IntArrayRef size, + IntArrayRef stride, + const TensorOptions &options); + +} // namespace at::detail diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/IndexKernels.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/IndexKernels.h new file mode 100644 index 0000000000000000000000000000000000000000..21b639da3e48744608061bdccb472223e89368f7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/IndexKernels.h @@ -0,0 +1,535 @@ +#pragma once + +namespace at::mps { + +static const char * indexing_metal_shaders = R"INDEX_METAL( +#include +#include + +using namespace metal; + +struct IndexAB { + constant int64_t* indexArray; +}; + +template +kernel void index_select( + constant IndexAB * indexAB [[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant OffsetsT * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + uint thread_index 
[[thread_position_in_grid]]) { + constant int64_t * index_sizes = (constant int64_t *)indexSizes; + constant int64_t * index_strides = (constant int64_t *)indexStrides; + int64_t offset = 0; + for (uint32_t i = 0; i < num_indices; i++) { + constant int64_t* indexArray = indexAB[i].indexArray; + int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; + if (index < 0) { + index += index_sizes[i]; + } + offset += index * index_strides[i]; + } + device T * out = (device T*)((device char*)outputData + offsets[thread_index].x); + constant T * in = (constant T*)((constant char*)inputData + offsets[thread_index].y + offset); + *out = *in; +} + +template +void index_put_impl( + constant IndexAB * indexAB, + constant int64_t * index_sizes, + constant int64_t * index_strides, + constant OffsetsT * offsets, + constant void * inputData, + device void * outputData, + constant uint32_t & num_indices, + uint thread_index) { + int64_t offset = 0; + for (uint32_t i = 0; i < num_indices; i++) { + constant int64_t* indexArray = indexAB[i].indexArray; + int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; + + if (index < 0) { + index += index_sizes[i]; + } + offset += index * index_strides[i]; + } + device T * out = (device T*)((device char*)outputData + offsets[thread_index].x + offset); + constant T * in = (constant T*)((constant char*)inputData + offsets[thread_index].y); + *out = *in; +} + +template +kernel void index_put_serial( + constant IndexAB * indexAB [[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant OffsetsT * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + constant uint * numIters [[buffer(7)]], + uint thread_index [[thread_position_in_grid]]) { + + constant int64_t * index_sizes = (constant int64_t *)indexSizes; + constant int64_t * index_strides = (constant int64_t 
*)indexStrides; + + for (uint iter_i = 0; iter_i < *numIters; iter_i++) { + index_put_impl(indexAB, index_sizes, index_strides, offsets, inputData, outputData, num_indices, iter_i); + } +} + +template +kernel void index_put( + constant IndexAB * indexAB [[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant OffsetsT * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + uint thread_index [[thread_position_in_grid]]) { + + constant int64_t * index_sizes = (constant int64_t *)indexSizes; + constant int64_t * index_strides = (constant int64_t *)indexStrides; + index_put_impl(indexAB, index_sizes, index_strides, offsets, inputData, outputData, num_indices, thread_index); +} + +#define REGISTER_INDEX_OP(DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE) \ +template \ +[[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE "_" #IDX_SIZE)]] \ +kernel void index_ ## INDEX_OP_TYPE( \ + constant IndexAB * indexAB [[buffer(0)]], \ + constant void * indexSizes [[buffer(1)]], \ + constant void * indexStrides [[buffer(2)]], \ + constant IDX_DTYPE * offsets [[buffer(3)]], \ + constant void * inputData [[buffer(4)]], \ + device void * outputData [[buffer(5)]], \ + constant uint32_t & num_indices [[buffer(6)]], \ + uint thread_index [[thread_position_in_grid]]); + +#define REGISTER_INDEX_OP_ALL_DTYPES(INDEX_OP_TYPE) \ + REGISTER_INDEX_OP(8bit, idx32, char, INDEX_OP_TYPE, uint3); \ + REGISTER_INDEX_OP(8bit, idx64, char, INDEX_OP_TYPE, ulong3); \ + REGISTER_INDEX_OP(16bit, idx32, short, INDEX_OP_TYPE, uint3); \ + REGISTER_INDEX_OP(16bit, idx64, short, INDEX_OP_TYPE, ulong3); \ + REGISTER_INDEX_OP(32bit, idx32, int, INDEX_OP_TYPE, uint3); \ + REGISTER_INDEX_OP(32bit, idx64, int, INDEX_OP_TYPE, ulong3); \ + REGISTER_INDEX_OP(64bit, idx32, long, INDEX_OP_TYPE, uint3); \ + REGISTER_INDEX_OP(64bit, idx64, long, 
INDEX_OP_TYPE, ulong3); + +REGISTER_INDEX_OP_ALL_DTYPES(select); +REGISTER_INDEX_OP_ALL_DTYPES(put); + +#define REGISTER_SINGLE_THREADED_INDEX_OP(DTYPE_SIZE, IDX_SIZE, DTYPE, INDEX_OP_TYPE, IDX_DTYPE) \ +template \ +[[host_name("index_" #INDEX_OP_TYPE "_" #DTYPE_SIZE "_" #IDX_SIZE)]] \ +kernel void index_ ## INDEX_OP_TYPE( \ + constant IndexAB * indexAB [[buffer(0)]], \ + constant void * indexSizes [[buffer(1)]], \ + constant void * indexStrides [[buffer(2)]], \ + constant IDX_DTYPE * offsets [[buffer(3)]], \ + constant void * inputData [[buffer(4)]], \ + device void * outputData [[buffer(5)]], \ + constant uint32_t & num_indices [[buffer(6)]], \ + constant uint * numIters [[buffer(7)]], \ + uint thread_index [[thread_position_in_grid]]); + +#define REGISTER_SINGLE_THREADED_INDEX_OP_ALL_DTYPES(INDEX_OP_TYPE) \ + REGISTER_SINGLE_THREADED_INDEX_OP(8bit, idx32, char, INDEX_OP_TYPE, uint3); \ + REGISTER_SINGLE_THREADED_INDEX_OP(8bit, idx64, char, INDEX_OP_TYPE, ulong3); \ + REGISTER_SINGLE_THREADED_INDEX_OP(16bit, idx32, short, INDEX_OP_TYPE, uint3); \ + REGISTER_SINGLE_THREADED_INDEX_OP(16bit, idx64, short, INDEX_OP_TYPE, ulong3); \ + REGISTER_SINGLE_THREADED_INDEX_OP(32bit, idx32, int, INDEX_OP_TYPE, uint3); \ + REGISTER_SINGLE_THREADED_INDEX_OP(32bit, idx64, int, INDEX_OP_TYPE, ulong3); \ + REGISTER_SINGLE_THREADED_INDEX_OP(64bit, idx32, long, INDEX_OP_TYPE, uint3); \ + REGISTER_SINGLE_THREADED_INDEX_OP(64bit, idx64, long, INDEX_OP_TYPE, ulong3); + +REGISTER_SINGLE_THREADED_INDEX_OP_ALL_DTYPES(put_serial); + +template +kernel void kernel_index_offsets(constant StridesT * strides [[buffer(0)]], + device DataT * data_offsets [[buffer(1)]], + constant uint * iter_shape [[buffer(2)]], + constant uint & num_dimensions [[buffer(3)]], + uint thread_index [[thread_position_in_grid]]) { + data_offsets[thread_index] = 0; + uint32_t idx = thread_index; + for (uint32_t dim = 0; dim < num_dimensions; dim++) { + uint32_t remainder = idx % iter_shape[dim]; + idx /= 
iter_shape[dim]; + + data_offsets[thread_index] += remainder * DataT(strides[dim]); + } +} + +template +[[host_name("kernel_index_offsets_32")]] +kernel void kernel_index_offsets( + constant packed_uint3 * strides [[buffer(0)]], + device uint3 * data_offsets [[buffer(1)]], + constant uint * iter_shape [[buffer(2)]], + constant uint & num_dimensions [[buffer(3)]], + uint thread_index [[thread_position_in_grid]]); + +template +[[host_name("kernel_index_offsets_64")]] +kernel void kernel_index_offsets( + constant packed_uint3 * strides [[buffer(0)]], + device ulong3 * data_offsets [[buffer(1)]], + constant uint * iter_shape [[buffer(2)]], + constant uint & num_dimensions [[buffer(3)]], + uint thread_index [[thread_position_in_grid]]); + +template +kernel void index_put_accumulate_native_dtypes( + constant IndexAB * indexAB [[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant OffsetsT * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + uint thread_index [[thread_position_in_grid]]) { + constant int64_t * index_sizes = (constant int64_t *)indexSizes; + constant int64_t * index_strides = (constant int64_t *)indexStrides; + int64_t offset = 0; + for (uint32_t i = 0; i < num_indices; i++) { + constant int64_t* indexArray = indexAB[i].indexArray; + int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; + if (index < 0) { + index += index_sizes[i]; + } + offset += index * index_strides[i]; + } + device T * out = (device T*)((device char*)outputData + offsets[thread_index].x + offset); + constant E * in = (constant E*)((constant char*)inputData + offsets[thread_index].y); + atomic_fetch_add_explicit(out, *in, memory_order_relaxed); +} + +template +__attribute__((__always_inline__)) void atomic_fetch_add_relaxed(device void * addr, T value) { + device atomic_uint* uintAddr = (device 
atomic_uint*)addr; + uint expected = atomic_load_explicit(uintAddr, memory_order_relaxed); + T updated = as_type(expected) + value; + while (!atomic_compare_exchange_weak_explicit(uintAddr, &expected, as_type(updated), memory_order_relaxed, memory_order_relaxed)) { + updated = as_type(expected) + value; + } +} + +template +kernel void atomic_index_put_accumulate( + constant IndexAB * indexAB [[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant OffsetsT * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + uint thread_index [[thread_position_in_grid]]) { + constant int64_t * index_sizes = (constant int64_t *)indexSizes; + constant int64_t * index_strides = (constant int64_t *)indexStrides; + int64_t offset = 0; + for (uint32_t i = 0; i < num_indices; i++) { + constant int64_t* indexArray = indexAB[i].indexArray; + int64_t index = indexArray[offsets[thread_index].z / sizeof(int64_t)]; + if (index < 0) { + index += index_sizes[i]; + } + offset += index * index_strides[i]; + } + device void * out = (device void*)((device char*)outputData + offsets[thread_index].x + offset); + constant T * in = (constant T*)((constant char*)inputData + offsets[thread_index].y); + atomic_fetch_add_relaxed(out, *in); +} + +template +[[host_name("index_put_accumulate_32bit_float_idx32")]] +kernel void atomic_index_put_accumulate( + constant IndexAB * indexAB [[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant uint3 * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + uint thread_index [[thread_position_in_grid]]); + +template +[[host_name("index_put_accumulate_32bit_float_idx64")]] +kernel void atomic_index_put_accumulate( + constant IndexAB * indexAB 
[[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant ulong3 * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + uint thread_index [[thread_position_in_grid]]); + +template +[[host_name("index_put_accumulate_32bit_int_idx32")]] +kernel void index_put_accumulate_native_dtypes( + constant IndexAB * indexAB [[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant uint3 * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + uint thread_index [[thread_position_in_grid]]); + +template +[[host_name("index_put_accumulate_32bit_int_idx64")]] +kernel void index_put_accumulate_native_dtypes( + constant IndexAB * indexAB [[buffer(0)]], + constant void * indexSizes [[buffer(1)]], + constant void * indexStrides [[buffer(2)]], + constant ulong3 * offsets [[buffer(3)]], + constant void * inputData [[buffer(4)]], + device void * outputData [[buffer(5)]], + constant uint32_t & num_indices [[buffer(6)]], + uint thread_index [[thread_position_in_grid]]); +)INDEX_METAL"; + +static const char *SCATTER_OPS_TEMPLATE = R"METAL_SCATTER( +struct __attribute__ ((packed)) packed_uint5{{ + uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u; +}}; + +template +Y cast(const X x); + +template<> +{1} cast<{1}, {0}>(const {0} x) {{ + return {2}; +}} + +kernel void scatter_kernel_5(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint5 & size [[buffer(2)]], + constant packed_uint5 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + 
packed_uint5 local_index; + local_index.x = linear_index / (size.u * size.w * size.z * size.y) % size.x; + local_index.y = linear_index / (size.u * size.w * size.z) % size.y; + local_index.z = linear_index / (size.u * size.w) % size.z; + local_index.w = linear_index / size.u % size.w; + local_index.u = linear_index % size.u; + + packed_uint5 strided_index; + strided_index.x = local_index.x * stride.x; + strided_index.y = local_index.y * stride.y; + strided_index.z = local_index.z * stride.z; + strided_index.w = local_index.w * stride.w; + strided_index.u = local_index.u * stride.u; + + dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w + strided_index.u] = cast<{1}>(src[linear_index]); +}} + +kernel void scatter_kernel_4(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint4 & size [[buffer(2)]], + constant packed_uint4 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint4 local_index; + local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0]; + local_index.y = linear_index / (size[3] * size[2]) % size[1]; + local_index.z = linear_index / size[3] % size[2]; + local_index.w = linear_index % size[3]; + + const packed_uint4 strided_index = local_index * stride; + dst[strided_index.x + strided_index.y + strided_index.z + strided_index.w] = cast<{1}>(src[linear_index]); +}} + +kernel void scatter_kernel_3(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint3 & size [[buffer(2)]], + constant packed_uint3 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + 
packed_uint3 local_index; + local_index.x = linear_index / (size[2] * size[1]) % size[0]; + local_index.y = linear_index / size[2] % size[1]; + local_index.z = linear_index % size[2]; + + const packed_uint3 strided_index = local_index * stride; + dst[strided_index.x + strided_index.y + strided_index.z] = cast<{1}>(src[linear_index]); +}} + +kernel void scatter_kernel_2(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint2 & size [[buffer(2)]], + constant packed_uint2 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint2 local_index; + local_index.x = linear_index / size[1] % size[0]; + local_index.y = linear_index % size[1]; + + const packed_uint2 strided_index = local_index * stride; + dst[strided_index.x + strided_index.y] = cast<{1}>(src[linear_index]); +}} + +kernel void scatter_kernel_1(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant int & size [[buffer(2)]], + constant int & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + const int local_index = linear_index % size; + const int strided_index = local_index * stride; + dst[strided_index] = cast<{1}>(src[linear_index]); +}} +)METAL_SCATTER"; + +static const char *GATHER_OPS_TEMPLATE = R"METAL_GATHER( +struct __attribute__ ((packed)) packed_uint5{{ + uint32_t x; uint32_t y; uint32_t z; uint32_t w; uint32_t u; +}}; + +template +Y cast(const X x); + +template<> +{1} cast<{1}, {0}>(const {0} x) {{ + return {2}; +}} + +kernel void gather_kernel_5(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void 
* dst_ [[buffer(1)]], + constant packed_uint5 & size [[buffer(2)]], + constant packed_uint5 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + + packed_uint5 local_index; + local_index.x = linear_index / (size.u * size.w * size.z * size.y) % size.x; + local_index.y = linear_index / (size.u * size.w * size.z) % size.y; + local_index.z = linear_index / (size.u * size.w) % size.z; + local_index.w = linear_index / size.u % size.w; + local_index.u = linear_index % size.u; + + packed_uint5 strided_index; + strided_index.x = local_index.x * stride.x; + strided_index.y = local_index.y * stride.y; + strided_index.z = local_index.z * stride.z; + strided_index.w = local_index.w * stride.w; + strided_index.u = local_index.u * stride.u; + + dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z + strided_index.w + strided_index.u]); +}} + +kernel void gather_kernel_4(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint4 & size [[buffer(2)]], + constant packed_uint4 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint4 local_index; + local_index.x = linear_index / (size[3] * size[2] * size[1]) % size[0]; + local_index.y = linear_index / (size[3] * size[2]) % size[1]; + local_index.z = linear_index / size[3] % size[2]; + local_index.w = linear_index % size[3]; + + const packed_uint4 strided_index = local_index * stride; + dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z + strided_index.w]); +}} + +kernel void gather_kernel_3(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * 
dst_ [[buffer(1)]], + constant packed_uint3 & size [[buffer(2)]], + constant packed_uint3 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint3 local_index; + local_index.x = linear_index / (size[2] * size[1]) % size[0]; + local_index.y = linear_index / size[2] % size[1]; + local_index.z = linear_index % size[2]; + + const packed_uint3 strided_index = local_index * stride; + dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y + strided_index.z]); +}} + +kernel void gather_kernel_2(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant packed_uint2 & size [[buffer(2)]], + constant packed_uint2 & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + packed_uint2 local_index; + local_index.x = linear_index / size[1] % size[0]; + local_index.y = linear_index % size[1]; + + const packed_uint2 strided_index = local_index * stride; + dst[linear_index] = cast<{1}>(src[strided_index.x + strided_index.y]); +}} + +kernel void gather_kernel_1(uint linear_index [[thread_position_in_grid]], + constant void * src_ [[buffer(0)]], + device void * dst_ [[buffer(1)]], + constant int & size [[buffer(2)]], + constant int & stride [[buffer(3)]], + constant uint32_t & numel [[buffer(4)]]) {{ + if (linear_index >= numel) return; + + constant {0} * src = (constant {0} *)src_; + device {1} * dst = (device {1} *)dst_; + + const int local_index = linear_index % size; + const int strided_index = local_index * stride; + dst[linear_index] = cast<{1}>(src[strided_index]); +}} +)METAL_GATHER"; +} // namespace at::mps diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h 
b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h new file mode 100644 index 0000000000000000000000000000000000000000..1dc8c434f85b76c47ec5b8d7355a64ad3e3254ed --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocator.h @@ -0,0 +1,403 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// this implementation is based on CUDACachingAllocator. +// It utilizes Metal Heaps to improve the performance with buffer allocation. +// Do not include this header. Use MPSAllocatorInterface.h instead. +// TODO: Unify the logic with CUDACachingAllocator and remove redundant code. +namespace at::mps::HeapAllocator { + +static const size_t kMaxSmallAlloc = MB(1); // largest "small" allocation is 1 MiB +static const size_t kMinLargeAlloc = MB(10); // allocations between 1 and 10 MiB may use kLargeHeap +static const size_t kRoundLarge = MB(2); // round up large allocations to 2 MiB +static const size_t kSmallHeap = MB(8); // "small" allocations are packed in 8 MiB heaps +static const size_t kLargeHeap = MB(32); // "large" allocations may be packed in 32 MiB heaps +static const size_t kXLargeHeapD = MB(128); // "extra large" allocations on Discrete devices may be packed in 128 MiB heaps +static const size_t kXLargeHeapU = MB(1024); // "extra large" allocations on Unified devices may be packed in 1 GiB heaps +static const size_t kMaxScalarAlloc = (sizeof(int64_t)); // largest "scalar" allocation + +// buffer pools could be customized with a combination of usage flags +enum UsageFlags : uint32_t { + PRIVATE = 0, + SMALL = (1 << 0), // small heaps have sizes of kSmallHeap, and large ones kLargeHeap + SHARED = (1 << 1), // shared pools allocated on devices with unified memory; otherwise, private between host/device + MANAGED = (1 << 2), // managed storage mode + HAZARD = (1 << 3), // enables Automatic Hazard Tracking for the resources 
allocated on the pool + SCALAR = (1 << 4), // used to import CPU scalar values to GPU and use them in MPS Stream +}; +// debug verbosity flags +enum DebugVerbosity : uint32_t { + SILENT = 0, + PROFILING = (1 << 0), // print generic profiling data for total system memory usage + ALLOCATIONS = (1 << 1), // print buffer allocations + RECYCLES = (1 << 2), // print buffer recycling + RELEASES = (1 << 3), // print buffer releases + LARGE_ONLY = (1 << 4), // only log large buffer pool transactions +}; + +struct HeapBlock; + +struct BufferBlock { + id buffer; + void* cpu_ptr = nullptr; // stores the pointer to CPU mapping of a Shared MTLBuffer + size_t size; // size after alignment + size_t requested_size; // requested size (before alignment) + // buffer shape is used for retrieving base of views in cached graphs + std::vector shape; + bool in_use = false; + HeapBlock* heap; + id_t buf_id; + // counter to candidate least recently used buffers for garbage collection + uint32_t gc_count = 0; + uint32_t use_count = 0; + // counter to assign unique ids to buffer blocks + static uint64_t buffer_counter; + // Metal events used to sync GPU/CPU operations on the shared-storage buffers + MPSEventPtr event; + + BufferBlock(size_t Size, size_t RequestedSize = 0, const id Buffer = nullptr, + HeapBlock* Heap = nullptr) : + buffer(Buffer), size(Size), requested_size(RequestedSize), + heap(Heap), buf_id(Buffer ? ++buffer_counter : 0) { } + + static bool Comparator(const BufferBlock* a, const BufferBlock* b) { + return (a->size != b->size) ? 
a->size < b->size : (uintptr_t)a->buffer < (uintptr_t)b->buffer; + } + static size_t alignUp(size_t Size, size_t Alignment) { + assert(((Alignment - 1) & Alignment) == 0); + return ((Size + Alignment - 1) & ~(Alignment - 1)); + } + uint32_t retainCount() const { return [buffer retainCount]; } +}; +typedef bool (*BufferComparison)(const BufferBlock*, const BufferBlock*); + +struct BufferPool; +struct AllocParams { + AllocParams(size_t Alloc_Size, size_t Requested_Size, BufferPool* Pool) : + search_key(Alloc_Size), pool(Pool), requested_size(Requested_Size) { } + size_t size() const { return search_key.size; } + + BufferBlock search_key; + BufferPool* pool; + BufferBlock* buffer_block = nullptr; + size_t requested_size; + // true if we exceed the low watermark limit. In this case + // we apply strategies to relieve the pressure before allocation. + bool has_memory_pressure = false; + // true if we're allocating on a unified memory device + bool has_unified_memory = true; +}; + +struct HeapBlock { + id heap; + struct { size_t total, available; } size; + BufferPool* pool; + unsigned int n_buffers = 0; + id_t heap_id; + // indicates if we split this heap to sub-allocate 'several' buffers (otherwise single buffer) + bool is_split; + // counter to assign unique ids to heap blocks + static uint64_t heap_counter; + + HeapBlock(size_t Size, const id Heap = nullptr, BufferPool *Pool = nullptr) : + heap(Heap), size({.total = Size, .available = Size}), pool(Pool), + heap_id(Heap ? ++heap_counter : 0), is_split(true) { } + + static MTLResourceOptions getOptions(uint32_t usage) { + // TODO: check the caching performance of write-combined mode + MTLResourceOptions options = MTLResourceCPUCacheModeDefaultCache; + + if (usage & UsageFlags::MANAGED) + options |= MTLResourceStorageModeManaged; + else if (usage & UsageFlags::SHARED) + options |= MTLResourceStorageModeShared; + else + options |= MTLResourceStorageModePrivate; + + options |= (usage & UsageFlags::HAZARD) ? 
MTLResourceHazardTrackingModeTracked : MTLResourceHazardTrackingModeUntracked; + + return options; + } + + static HeapBlock* createHeapBlock(AllocParams& params, id device, uint32_t usage) { + HeapBlock *heapBlock = nullptr; + bool is_split = true; + const size_t size = params.size(); + MTLHeapDescriptor *d = [MTLHeapDescriptor new]; + if (d) { + const size_t kXLargeHeap = params.has_unified_memory ? kXLargeHeapU : kXLargeHeapD; + if (size <= kMaxSmallAlloc) { + d.size = kSmallHeap; + } else if (size < kMinLargeAlloc) { + d.size = kLargeHeap; + } else if (size < kXLargeHeap / 2 && !params.has_memory_pressure) { + d.size = kXLargeHeap; + } else { + d.size = kRoundLarge * ((size + kRoundLarge - 1) / kRoundLarge); + is_split = false; + } + d.storageMode = (usage & UsageFlags::SHARED) ? MTLStorageModeShared : MTLStorageModePrivate; + d.cpuCacheMode = MTLCPUCacheModeDefaultCache; + // this automatically handles Metal buffer access synchronizations at the + // cost of slightly lower performance. + d.hazardTrackingMode = (usage & UsageFlags::HAZARD) ? MTLHazardTrackingModeTracked : MTLHazardTrackingModeUntracked; + d.resourceOptions = getOptions(usage); + d.type = MTLHeapTypeAutomatic; + id heap = [device newHeapWithDescriptor: d]; + if (heap) { + [heap setPurgeableState:MTLPurgeableStateNonVolatile]; + const size_t heap_size = heapAvailableSize(heap); + heapBlock = new HeapBlock(heap_size, heap, params.pool); + if (heapBlock) { + heapBlock->is_split = is_split; + } + } + [d release]; + } + return heapBlock; + } + static bool Comparator(const HeapBlock* a, const HeapBlock* b) { + return (a->size.available != b->size.available) ? 
a->size.available < b->size.available : + (uintptr_t)a->heap < (uintptr_t)b->heap; + } + static NSUInteger heapAvailableSize(id heap, size_t Alignment = vm_page_size) { + return [heap maxAvailableSizeWithAlignment:Alignment]; + } + NSUInteger Size() { + return [heap size]; + } + id newMTLBuffer(size_t length, uint32_t usage) { + id buf = [heap newBufferWithLength:length options:getOptions(usage)]; + if (buf) { + updateAvailableSize(); + n_buffers++; + } + return buf; + } + // returns the retainCount before releasing the buffer + uint32_t releaseMTLBuffer(id& buffer) { + const uint32_t retainCount = [buffer retainCount]; + [buffer release]; + buffer = nil; + updateAvailableSize(); + n_buffers--; + return retainCount; + } + // returns the retainCount before releasing the heap + uint32_t releaseMTLHeap() { + const uint32_t retainCount = [heap retainCount]; + TORCH_INTERNAL_ASSERT(!n_buffers); // assert if heap isn't empty + [heap setPurgeableState:MTLPurgeableStateEmpty]; + [heap release]; + heap = nil; + size.available = 0; + return retainCount; + } + uint32_t retainCount() const { return [heap retainCount]; } + void updateAvailableSize() { size.available = heapAvailableSize(heap); } +}; +typedef bool (*HeapComparison)(const HeapBlock*, const HeapBlock*); + +struct BufferPool { + enum class Kind { + PRIVATE_SMALL, + PRIVATE_LARGE, + SHARED_SMALL, + SHARED_LARGE, + SCALAR, + }; + + BufferPool(const id Device, uint32_t Usage) : + device(Device), usage(Usage), + heaps(HeapBlock::Comparator), available_buffers(BufferBlock::Comparator) { } + + const id device; + // usage flags to customize the pool for various purposes (see UsageFlags enum) + const uint32_t usage; + // total number of buffers in the pool + uint32_t n_buffers = 0; + // total allocations size on this pool + size_t allocated_size = 0; + // total memory available in the pool + size_t available_size = 0; + // list of heaps ordered by their "available" (not total) memory size + std::set heaps; + // list of only 
"available" buffers in the pool (i.e., buffers not in-use) + std::set available_buffers; + // list of buffers that are in a state of "limbo" where they've already been freed + // from PyTorch-side, but were not returned to pool due to still being + // in-use by command buffers with retainCount > 1. In this state, the buffer is + // neither ready to be recycled, nor could be returned to pool as available. + // These buffers will be returned to pool once the command buffer's + // completionHandler callbacks are called. + std::unordered_set buffers_pending_free; + // list of heaps pending size update + std::unordered_set heaps_pending_update; +}; + +class MPSHeapAllocatorImpl { +public: + explicit MPSHeapAllocatorImpl() : + m_device(at::mps::MPSDevice::getInstance()->device()), + m_max_buffer_size([m_device maxBufferLength]), + m_stream(getDefaultMPSStream()), + m_event_pool(getMPSEventPool()) { + init_allocator(); + } + ~MPSHeapAllocatorImpl() { + emptyCache(); + } + // interface exposed to at::Allocator + id malloc(size_t size, uint32_t usage); + // frees a buffer and returns it into buffer pool + void free(void* ptr); + // releases all the cached buffers and their associated heaps + void emptyCache(); + // free inactive buffers that are pending to be freed + void freeInactiveBuffers(); + // returns true if buffer was allocated from the shared pool + bool isSharedBuffer(const void* ptr); + // get the requested unaligned size of an MTLBuffer + ssize_t getUnalignedBufferSize(const void* ptr); + // set the shape of a base tensor from a view tensor + void setBufferShape(const void* ptr, const IntArrayRef& shape); + // retrieve the shape of a base tensor from a view tensor + IntArrayRef getBufferShape(const void* ptr); + // get the unique ID of the buffer + id_t getBufferId(const void* ptr); + // allocate a buffer from a specialized pool to import CPU scalars into GPU + id allocScalarBufferWithValue(void* value, size_t size); + // returns a CPU-mapping of the input 
buffer and its retainCount, + // only if it has Shared storage-mode and was allocated on MPSAllocator + std::pair getSharedBufferPtr(const void* buffer); + // records events for a list of MTLBuffers (list is used to lock the mutex once) + // returns true if any event was recorded (i.e., if the passed buffers exist and are shared-storage) + bool recordEvents(c10::ArrayRef buffers); + // waits for the event to signal the completion of GPU execution + // on the passed shared buffers (list is used to lock the mutex once) + // returns true if actually waited on any event + bool waitForEvents(c10::ArrayRef buffers); + // this indicates how far (in Megabytes) the current total allocations are from the + // low watermark limit which is used to detect if we're under memory pressure + // This returns zero if we've reached the low watermark limit + ssize_t getLowWatermarkValue(); + // (see m_low_watermark_ratio for description) + void setLowWatermarkRatio(double ratio); + // (see m_high_watermark_ratio for description) + void setHighWatermarkRatio(double ratio); + // (see m_low_watermark_limit for description) + size_t getLowWatermarkLimit() const { return m_low_watermark_limit; } + // (see m_max_total_allowed_size for description) + size_t getHighWatermarkLimit() const { return m_max_total_allowed_size; } + // (see m_total_allocated_memory for description) + size_t getTotalAllocatedMemory() const { return m_total_allocated_memory; } + // (see m_current_allocated_memory for description) + size_t getCurrentAllocatedMemory() const { return m_current_allocated_memory; } + // total GPU memory allocated in the process by Metal driver; including + // implicit allocations from MPS/MPSGraph frameworks and MPSHeapAllocatorImpl.
+ size_t getDriverAllocatedMemory() const { return current_allocated_size(); } + // recommended Max memory for Metal + size_t getRecommendedMaxMemory() const { return max_device_size(); } + // (see enum DebugVerbosity for description) + uint32_t getDebugVerbosity() const { return m_debug_verbosity; } + // returns the device that we allocate from + inline id Device() const { return m_device; } + + // TODO: make a common function to do size unit conversions in PyTorch. + inline std::string format_size(uint64_t size) const; + +private: + // (see m_high_watermark_ratio for description) + constexpr static double default_high_watermark_ratio = 1.7; + // we set the allowed upper bound to twice the size of recommendedMaxWorkingSetSize. + constexpr static double default_high_watermark_upper_bound = 2.0; + // (see m_low_watermark_ratio for description) + // on unified memory, we could allocate beyond the recommendedMaxWorkingSetSize + constexpr static double default_low_watermark_ratio_unified = 1.4; + constexpr static double default_low_watermark_ratio_discrete = 1.0; + + const id m_device; + std::recursive_mutex m_mutex; + // allocated buffers by device pointer + ska::flat_hash_map m_allocated_buffers; + // using a container for pools to simplify iterating them + ska::flat_hash_map> m_pools; + // total memory allocated by HeapAllocator (including blocks in pools) + size_t m_total_allocated_memory = 0; + // currently active memory allocations in use (i.e., blocks not in pools) + size_t m_current_allocated_memory = 0; + // max buffer size allowed by Metal + size_t m_max_buffer_size = 0; + // maximum total size allowed to be allocated + size_t m_max_total_allowed_size = 0; + // high watermark ratio is a hard limit for the total allowed allocations + // 0. : disables high watermark limit (may cause system failure if system-wide OOM occurs) + // 1. 
: recommended maximum allocation size (i.e., device.recommendedMaxWorkingSetSize) + // >1.: allows limits beyond the device.recommendedMaxWorkingSetSize + // e.g., value 0.95 means we allocate up to 95% of recommended maximum + // allocation size; beyond that, the allocations would fail with OOM error. + double m_high_watermark_ratio; + // low watermark ratio is a soft limit to attempt limiting memory allocations up to the lower watermark + // level by garbage collection or committing command buffers more frequently (a.k.a, adaptive commit). + // Value between 0 to m_high_watermark_ratio (setting 0.0 disables adaptive commit and garbage collection) + // e.g., value 0.9 means we 'attempt' to limit allocations up to 90% of recommended maximum + // allocation size. + double m_low_watermark_ratio; + // low watermark size limit (in Bytes) at the time we initialize the allocator + size_t m_low_watermark_limit; + // use "PYTORCH_DEBUG_MPS_ALLOCATOR" env-var to set debug verbosity + uint32_t m_debug_verbosity; + // default MPS stream + MPSStream* m_stream; + // we hold a reference to MPSEventPool so it could get destroyed after MPSAllocator + std::shared_ptr m_event_pool; + + void init_allocator(); + void init_buffer_pools(); + HeapBlock* get_free_heap(AllocParams& params); + bool get_free_buffer(AllocParams& params); + BufferBlock* get_allocated_buffer_block(const void* ptr); + BufferBlock* alloc_buffer_block(size_t size, uint32_t usage); + bool alloc_buffer(AllocParams& params); + void free_buffer(BufferBlock* buffer_block); + // returns true if the container heap is also released + bool release_buffer(BufferBlock* buffer_block, bool remove_empty_heap = true); + void release_buffers(BufferPool& pool); + bool release_available_cached_buffers(AllocParams& params); + bool release_cached_buffers(); + // free unused cached blocks to reclaim GPU memory if memory pressure is high + void garbage_collect_cached_buffers(AllocParams& params); + // returns the suitable buffer pool 
type for the usage or + // requested/allocated sizes + BufferPool& get_pool(size_t requested_size, size_t aligned_size, uint32_t usage); + // returns the aligned allocation size that is optimized + // for the buffers to get reused frequently + size_t get_allocation_size(size_t size, uint32_t usage) const; + // maximum size of device memory available for allocation in current process + // Note: the recommendedMaxWorkingSetSize is typically 75% of the total system memory. + size_t max_device_size() const { return [m_device recommendedMaxWorkingSetSize]; } + // there are implicit allocations from MPS backend, so we need to query the 'device' for + // total allocated size instead of manually tracking in MPSAllocator + size_t current_allocated_size() const { return [m_device currentAllocatedSize]; } + + bool trigger_memory_callbacks(BufferBlock* buffer_block, IMpsAllocatorCallback::EventType event) const { + for (const auto& name : MPSAllocatorCallbacksRegistry()->Keys()) { + MPSAllocatorCallbacksRegistry()->Create(name)->executeMPSAllocatorCallback(buffer_block ? buffer_block->buffer : nullptr, event); + } + return true; + } +}; + +} // namespace at::mps::HeapAllocator diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h new file mode 100644 index 0000000000000000000000000000000000000000..9aa4769f76ed68669313be64ed75177a743df5bd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSAllocatorInterface.h @@ -0,0 +1,64 @@ +// Copyright © 2023 Apple Inc. + +#pragma once + +#include +#include +#include + +#define MB(x) ((x) * 1048576UL) // size in MiB -> bytes; argument is parenthesized so MB(a + b) scales the whole expression + +namespace at::mps { + +// this is a public interface to access MPSAllocator. +// Do not declare methods that would depend on MPS or Metal frameworks. +class IMPSAllocator : public c10::Allocator { +public: + // see the comments in MPSAllocator.h for the description of these methods.
+ virtual void emptyCache() const = 0; + virtual void freeInactiveBuffers() const = 0; + virtual ssize_t getUnalignedBufferSize(const void* ptr) const = 0; + virtual IntArrayRef getBufferShape(const void* ptr) const = 0; + virtual id_t getBufferId(const void* ptr) const = 0; + virtual void setBufferShape(const void* ptr, const IntArrayRef& shape) const = 0; + virtual bool isSharedBuffer(const void* ptr) const = 0; + virtual bool isSharedStorageSupported() const = 0; + virtual c10::DataPtr allocScalarBufferWithValue(void* value, size_t size) const = 0; + virtual std::string formatSize(size_t size) const = 0; + virtual void setLowWatermarkRatio(double ratio) const = 0; + virtual void setHighWatermarkRatio(double ratio) const = 0; + virtual ssize_t getLowWatermarkValue() const = 0; + virtual size_t getLowWatermarkLimit() const = 0; + virtual size_t getHighWatermarkLimit() const = 0; + virtual size_t getTotalAllocatedMemory() const = 0; + virtual size_t getCurrentAllocatedMemory() const = 0; + virtual size_t getDriverAllocatedMemory() const = 0; + virtual size_t getRecommendedMaxMemory() const = 0; + virtual std::pair getSharedBufferPtr(const void* ptr) const = 0; + virtual bool recordEvents(c10::ArrayRef buffers) const = 0; + virtual bool waitForEvents(c10::ArrayRef buffers) const = 0; +}; + +class IMpsAllocatorCallback { + public: + enum class EventType { + ALLOCATED, // buffer got allocated to be used immediately + RECYCLED, // buffer pulled from free list to be reused + FREED, // buffer put to free list for future recycling + RELEASED, // buffer memory released + ALLOCATION_FAILED // buffer allocation failed + }; + virtual ~IMpsAllocatorCallback() = default; + virtual void executeMPSAllocatorCallback(void* ptr, EventType event) = 0; +}; + +// MPS allocator executes every registered callback for an allocator event, passing the EventType of that event (presumably not only when memory is freed; NOTE(review): confirm call sites in MPSAllocator.mm). +C10_DECLARE_REGISTRY(MPSAllocatorCallbacksRegistry, IMpsAllocatorCallback); +#define REGISTER_MPS_ALLOCATOR_CALLBACK(name, ...)
\ + C10_REGISTER_CLASS(MPSAllocatorCallbacksRegistry, name, __VA_ARGS__); + +IMPSAllocator* getIMPSAllocator(bool sharedAllocator = false); + +bool isMPSPinnedPtr(const void* data); + +} // namespace at::mps diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSDevice.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSDevice.h new file mode 100644 index 0000000000000000000000000000000000000000..1a7df3ba3620c66a7c2f490090df6e171224646c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSDevice.h @@ -0,0 +1,84 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include +#include +#include + + +#ifdef __OBJC__ +#include +#include +#include +typedef id MTLDevice_t; +typedef id MTLLibrary_t; +typedef id MTLComputePipelineState_t; +typedef id MTLLibrary_t; +#else +typedef void* MTLDevice; +typedef void* MTLDevice_t; +typedef void* MTLLibrary_t; +typedef void* MTLComputePipelineState_t; +typedef void* MTLLibrary_t; +#endif + +namespace at::mps { + +// Helper enum to check if a MPSGraph op is supported in a given macOS version +enum class MacOSVersion : uint32_t { + MACOS_VER_13_1_PLUS = 0, + MACOS_VER_13_2_PLUS, + MACOS_VER_13_3_PLUS, + MACOS_VER_14_0_PLUS, + MACOS_VER_14_4_PLUS, + MACOS_VER_15_0_PLUS, +}; + +//----------------------------------------------------------------- +// MPSDevice +// +// MPSDevice is a singleton class that returns the default device +//----------------------------------------------------------------- + +class TORCH_API MPSDevice { + public: + /** + * MPSDevice should not be cloneable. + */ + MPSDevice(MPSDevice& other) = delete; + /** + * MPSDevice should not be assignable. + */ + void operator=(const MPSDevice&) = delete; + /** + * Gets single instance of the Device. + */ + static MPSDevice* getInstance(); + /** + * Returns the single device. 
+ */ + MTLDevice_t device() { + return _mtl_device; + } + /** + * Returns whether running on Ventura or newer + */ + bool isMacOS13Plus(MacOSVersion version) const; + + MTLComputePipelineState_t metalIndexingPSO(const std::string &kernel); + MTLLibrary_t getMetalIndexingLibrary(); + + ~MPSDevice(); + + private: + static MPSDevice* _device; + MTLDevice_t _mtl_device; + MTLLibrary_t _mtl_indexing_library; + MPSDevice(); +}; + +TORCH_API bool is_available(); +TORCH_API bool is_macos_13_or_newer(MacOSVersion version); +TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); + +} // namespace at::mps diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h new file mode 100644 index 0000000000000000000000000000000000000000..880ff1c75d12e17ecf719f3de875c2217f852e51 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSEvent.h @@ -0,0 +1,100 @@ +// Copyright © 2023 Apple Inc. + +#pragma once + +#include +#include +#include + +namespace at::mps { + +// NOTE: don't create instances of this class directly. +// Use MPSEventPool to acquire instances of MPSEvent. +class MPSEvent { +public: + explicit MPSEvent(id_t ID, MPSStream* stream, bool enable_timing); + ~MPSEvent(); + + // records an event on the stream + void record(bool needsLock, bool syncEvent = false); + // makes all future work submitted to the stream wait for this event. + bool wait(bool needsLock, bool syncEvent = false); + // schedules a notifyListener callback for the event. + bool notify(bool needsLock, MTLSharedEventNotificationBlock block); + // checks if events are already signaled. + bool query() const; + // blocks the CPU thread until all the GPU work that were scheduled + // prior to recording this event are completed. 
+ bool synchronize(); + // resets this event with new parameters in case it gets reused from the event pool + void reset(MPSStream* stream, bool enable_timing); + // returns the unique ID of the event instance + id_t getID() const { return m_id; } + // returns the completion timestamp of the event + uint64_t getCompletionTime() const { return m_completion_time; } + // if already recorded, waits for cpu_sync_cv to be signaled + void waitForCpuSync(); + +private: + id_t m_id; + // enables measuring the completion time of the notifyListener of this event + bool m_enable_timing; + uint64_t m_signalCounter = 0; + MPSStream* m_stream = nullptr; + MTLSharedEvent_t m_event = nullptr; + MTLSharedEventListener* m_listener = nullptr; + // used to sync the events created on this Stream with CPU + std::mutex m_cpu_sync_mutex{}; + std::condition_variable m_cpu_sync_cv{}; + // CondVar predicate to sync the events created on this Stream with CPU + bool m_cpu_sync_completed = false; + // used to compute elapsed time + uint64_t m_completion_time = 0; + + void recordLocked(bool syncEvent); + bool waitLocked(bool syncEvent); + bool notifyLocked(MTLSharedEventNotificationBlock block); + void notifyCpuSync(); + static uint64_t getTime() { + return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); + } +}; + +typedef std::unique_ptr> MPSEventPtr; + +class MPSEventPool { +public: + explicit MPSEventPool(MPSStream* default_stream); + ~MPSEventPool(); + + MPSEventPtr acquireEvent(bool enable_timing, MPSStream* stream); + void emptyCache(); + + // these are mainly used for MPSHooks and torch.mps.Event() bindings + id_t acquireEvent(bool enable_timing); + void releaseEvent(id_t event_id); + void recordEvent(id_t event_id, bool syncEvent); + void waitForEvent(id_t event_id, bool syncEvent); + void synchronizeEvent(id_t event_id); + bool queryEvent(id_t event_id); + // returns elapsed time between two recorded events in milliseconds + double elapsedTime(id_t start_event_id, id_t end_event_id); + 
+private: + MPSStream* m_default_stream = nullptr; + std::recursive_mutex m_mutex; + std::stack> m_pool{}; + // dictionary to associate event IDs with event objects + // used to retain in-use events out of the pool + // for torch.mps.Event() bindings. + std::unordered_map m_in_use_events{}; + uint64_t m_event_counter = 0; + std::function m_default_deleter; + + MPSEvent* getInUseEvent(id_t event_id, bool locked = true); +}; + +// shared_ptr is used to get MPSEventPool destroyed after dependent instances +std::shared_ptr getMPSEventPool(); + +} // namespace at::mps diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..98cb66207ab435c00d10366f24f967e3aa30eff8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGeneratorImpl.h @@ -0,0 +1,52 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace mps::detail { + +constexpr uint32_t PHILOX_STATE_N = 7; +struct rng_data_pod { + std::array state{1}; + uint64_t seed = default_rng_seed_val; +}; + +TORCH_API const Generator& getDefaultMPSGenerator(); +TORCH_API Generator createMPSGenerator(uint64_t seed_val = default_rng_seed_val); + +} // namespace mps::detail + +struct TORCH_API MPSGeneratorImpl : public c10::GeneratorImpl { + // Constructors + MPSGeneratorImpl(uint64_t seed_in = default_rng_seed_val); + ~MPSGeneratorImpl() override = default; + + // MPSGeneratorImpl methods + std::shared_ptr clone() const; + void set_current_seed(uint64_t seed) override; + void set_offset(uint64_t offset) override; + uint64_t get_offset() const override; + uint64_t current_seed() const override; + uint64_t seed() override; + void set_state(const c10::TensorImpl& new_state) override; + c10::intrusive_ptr get_state() const override; + void update_philox_counters(); + 
+ void set_engine(at::Philox4_32 engine) { engine_ = engine; }; + at::Philox4_32 engine() { return engine_; }; + uint32_t* state_data() { return data_.state.data(); } + static DeviceType device_type() { return DeviceType::MPS; }; + +private: + mps::detail::rng_data_pod data_; + at::Philox4_32 engine_; + + MPSGeneratorImpl* clone_impl() const override; +}; + +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGuardImpl.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGuardImpl.h new file mode 100644 index 0000000000000000000000000000000000000000..cb50df2faeaeedc91087e6f90861835cbb5765f8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSGuardImpl.h @@ -0,0 +1,179 @@ +// Copyright © 2022 Apple Inc. + +#pragma once +#include +#include +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at::mps { + +typedef MPSEvent* mpsEvent_t; + +// TODO: Move the MPSGuardImpl to inherit from NoOpDeviceGuardImpl +// https://github.com/pytorch/pytorch/issues/77170 +struct TORCH_API MPSGuardImpl final : public c10::impl::DeviceGuardImplInterface { + static constexpr c10::DeviceType static_type = c10::DeviceType::MPS; + + // constructor + MPSGuardImpl() {} + explicit MPSGuardImpl(c10::DeviceType t) { + TORCH_INTERNAL_ASSERT(t == c10::DeviceType::MPS); + } + + // returns the type + c10::DeviceType type() const override { + return c10::DeviceType::MPS; + } + + Device exchangeDevice(Device d) const override { + return Device(c10::DeviceType::MPS, 0); + } + + Device getDevice() const override { + return Device(c10::DeviceType::MPS, 0); + } + + std::optional uncheckedGetDevice() const noexcept { + return Device(c10::DeviceType::MPS, 0); + } + + void setDevice(Device d) const override { + TORCH_INTERNAL_ASSERT(d.is_mps()); + } + + void uncheckedSetDevice(Device d) 
const noexcept override { + // TODO: Currently setting only device 0 + } + + Stream getStream(Device d) const noexcept override { + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + + Stream getNewStream(Device, int priority = 0) const override { + (void)priority; + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + + Stream getDefaultStream(Device d) const override { + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + + // NB: These do NOT set the current device + Stream exchangeStream(Stream s) const noexcept override { + return Stream(Stream::DEFAULT, Device(c10::DeviceType::MPS, 0)); + } + DeviceIndex deviceCount() const noexcept override { + if (at::hasMPS()) { + //TODO: extend it for multi-device case + return 1; + } else { + return 0; + } + } + + // Event-related functions + void createEvent( + mpsEvent_t* event, + const EventFlag flag) const; + + void destroyEvent( + void* event, + const DeviceIndex device_index) const noexcept override; + + void record( + void** event, + const Stream& stream, + const DeviceIndex device_index, + const EventFlag flag) const override; + + void block( + void* event, + const Stream& stream) const override; + + bool queryEvent(void* event) const override; + +}; + +/// A variant of OptionalDeviceGuard that is specialized for MPS. 
+struct OptionalMPSGuard { + explicit OptionalMPSGuard() : guard_() {} + + explicit OptionalMPSGuard(std::optional device_opt) + : guard_(device_opt) {} + + /// Set the current MPS device to the passed device index, if it is not + /// nullopt + explicit OptionalMPSGuard(std::optional device_index_opt) + : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalMPSGuard(const OptionalMPSGuard&) = delete; + OptionalMPSGuard& operator=(const OptionalMPSGuard&) = delete; + OptionalMPSGuard(OptionalMPSGuard&& other) = delete; + OptionalMPSGuard& operator=(OptionalMPSGuard&& other) = delete; + + /// Sets the MPS device to the given device, initializing the guard if it + /// is not already initialized. Errors if the given device is not a MPS + /// device. + void set_device(Device device) { + guard_.set_device(device); + } + + /// Sets the MPS device to the given device, initializing the guard if it is + /// not already initialized. Errors if the given device is not a MPS device. + void reset_device(Device device) { + guard_.reset_device(device); + } + + /// Sets the MPS device to the given device index, initializing the guard if + /// it is not already initialized. + void set_index(DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set immediately prior to initialization of the + /// guard, or nullopt if the guard is uninitialized. + std::optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. + std::optional current_device() const { + return guard_.current_device(); + } + + /// Restore the original MPS device, resetting this guard to uninitialized + /// state. 
+ void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + + +C10_REGISTER_GUARD_IMPL(MPS, MPSGuardImpl); + +} // namespace at::mps diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSHooks.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSHooks.h new file mode 100644 index 0000000000000000000000000000000000000000..4858c0609f56b3e92ac0c6a5a012b20ff736e2fb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSHooks.h @@ -0,0 +1,60 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include +#include + +namespace at::mps { + +// The real implementation of MPSHooksInterface +struct MPSHooks : public at::MPSHooksInterface { + MPSHooks(at::MPSHooksArgs) {} + void initMPS() const override; + + // MPSDevice interface + bool hasMPS() const override; + bool isOnMacOSorNewer(unsigned major, unsigned minor) const override; + + // MPSGeneratorImpl interface + const Generator& getDefaultMPSGenerator() const override; + + // MPSStream interface + void deviceSynchronize() const override; + void commitStream() const override; + void* getCommandBuffer() const override; + void* getDispatchQueue() const override; + + // MPSAllocator interface + Allocator* getMPSDeviceAllocator() const override; + void emptyCache() const override; + size_t getCurrentAllocatedMemory() const override; + size_t getDriverAllocatedMemory() const override; + size_t getRecommendedMaxMemory() const override; + void setMemoryFraction(double ratio) const override; + bool isPinnedPtr(const void* data) const override; + Allocator* getPinnedMemoryAllocator() const override; + + // MPSProfiler interface + void profilerStartTrace(const std::string& mode, bool waitUntilCompleted) const override; + void profilerStopTrace() const override; + + // MPSEvent interface + uint32_t acquireEvent(bool enable_timing) const override; + void releaseEvent(uint32_t event_id) const override; + void 
recordEvent(uint32_t event_id) const override; + void waitForEvent(uint32_t event_id) const override; + void synchronizeEvent(uint32_t event_id) const override; + bool queryEvent(uint32_t event_id) const override; + double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const override; + + // Compatibility with Accelerator API + bool hasPrimaryContext(DeviceIndex device_index) const override { + // When MPS is available, it is always in use for the one device. + return true; + } +}; + +} // namespace at::mps diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h new file mode 100644 index 0000000000000000000000000000000000000000..7ee9db5dd3242c72a93d2e4ebdf1a41670f98f48 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSProfiler.h @@ -0,0 +1,402 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace at::mps { + +namespace Profiler { + +struct BaseInfo { + // profiling info types + enum class Type { + GRAPH, + KERNEL, + COPY, + CPU_FALLBACK, + }; + + BaseInfo(Type infoType, uint64_t Id, const uintptr_t Handle) : + type(infoType), profileId(Id), handle(Handle) { } + virtual ~BaseInfo() = default; + + // type of profiling info + Type type; + // unique profile ID for execution instances of operations or copies + uint64_t profileId; + // ID generated by os_signpost + // since it's possible to use event and interval-based signposts at the + // same time, we need separate IDs for each. 
+ os_signpost_id_t eventSignpostId = 0, intervalSignpostId = 0; + // accumulated GPU time in ms (obtained from CompletionHandler's "GPUEndTime - GPUStartTime") + std::atomic totalGpuTime{0.0}; + // accumulated Scheduling time in ms (obtained from CompletionHandler's "KernelEndTime - KernelStartTime") + std::atomic totalSchedulingTime{0.0}; + // indicates if the operation or copy execution has completed + std::atomic_bool completed{false}; + // handle used to identify the profile info's instance (usually the pointer) + const uintptr_t handle; + + virtual const std::string toString(double gpuTime = 0, double schedulingTime = 0) const; + // builds a string for a tensor (format: Device:ScalarType[tensor.sizes()]) + static std::string buildTensorString(const Tensor& tensor, bool includeBufferId = false) { + if (tensor.defined()) { + std::stringstream tensorStr; + auto deviceType = tensor.device().type(); + tensorStr << c10::DeviceTypeName(deviceType); + // see comments for INCLUDE_BUFFER_ID + if (includeBufferId && deviceType == at::kMPS) { + id buffer = __builtin_bit_cast(id, tensor.storage().data()); + tensorStr << "(buf#" << (getIMPSAllocator()->getBufferId(buffer)) + << ":" << buffer.retainCount << ")"; + } + tensorStr << ":" + << tensor.scalar_type() << tensor.sizes(); + return tensorStr.str(); + } else { + return "undefined"; + } + } + static uint64_t getTime() { + return clock_gettime_nsec_np(CLOCK_MONOTONIC_RAW); + } +}; + +struct OperationInfo : BaseInfo { + OperationInfo(const void* Handle, bool IsGraph, uint64_t Id, const std::string& StrKey) : + BaseInfo(IsGraph ? 
Type::GRAPH : Type::KERNEL, Id, uintptr_t(Handle)), strKey(StrKey) { } + + uint64_t runCount = 0; + std::string strKey; + + const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override; + + // builds a string for a kernel + static std::string buildKernelString(const std::string& kernelName, + const TensorList& tensors, + bool includeBufferId = false) { + std::stringstream kernelStr; + kernelStr << kernelName; + for (const Tensor& tensor: tensors) { + kernelStr << ":" << BaseInfo::buildTensorString(tensor, includeBufferId); + } + return kernelStr.str(); + } +}; + +struct CpuFbInfo : BaseInfo { + CpuFbInfo(uint64_t Id, const std::string& OpName) : + BaseInfo(Type::CPU_FALLBACK, Id, 0), opName(OpName) { } + + uint64_t runCount = 0; + // the current and total overhead of copies in bytes required to convert the Op's + // input tensors from MPS to CPU and then output from CPU back to MPS + size_t currentCopyOverhead = 0; + size_t totalCopyOverhead = 0; + std::string opName; + std::string strKey; + uint64_t startTime = 0; + + const std::string toString(double gpuTime = 0, double schedulingTime = 0) const override; + + void updateCopyOverhead(const TensorList& tensors) { + currentCopyOverhead = 0; + for (const Tensor& tensor: tensors) { + if (tensor.defined()) { + currentCopyOverhead += tensor.nbytes(); + } + } + totalCopyOverhead += currentCopyOverhead; + } +}; + +struct CopyInfo : BaseInfo { + enum class Kind { + MPS_TO_MPS, + MPS_TO_CPU, + CPU_TO_MPS, + }; + + CopyInfo(const void* Handle, size_t Length, uint64_t Id, bool IsNonBlocking, bool UsesBlitter) : + BaseInfo(Type::COPY, Id, uintptr_t(Handle)), kind(Kind::MPS_TO_MPS), + length(Length), isNonBlocking(IsNonBlocking), usesBlitter(UsesBlitter) { } + + Kind kind; + size_t length; + bool isNonBlocking; + bool usesBlitter; + std::string srcStrKey; + std::string dstStrKey; + // for copies that don't use blitters, we measure CPU time + uint64_t startTime = 0; + + const std::string 
toString(double gpuTime = 0, double schedulingTime = 0) const override; + + static std::string buildTensorString(const void* buffer, const OptionalTensorRef tensor, bool includeBufferId = false); + + static bool isStorageOnMPS(const void* buffer, const OptionalTensorRef tensor) { + if (tensor.has_value()) { + return tensor->device().type() == at::kMPS; + } + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(buffer); + // getUnalignedBufferSize() returns -1 if input buffer is not on MPS device + return getIMPSAllocator()->getUnalignedBufferSize(buffer) >= 0; + } + + static Kind getCopyKind(const void* srcBuffer, const void* dstBuffer, + const OptionalTensorRef srcTensor, const OptionalTensorRef dstTensor) { + const bool isSrcOnMPS = isStorageOnMPS(srcBuffer, srcTensor); + const bool isDstOnMPS = isStorageOnMPS(dstBuffer, dstTensor); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY(isSrcOnMPS || isDstOnMPS); + if (isSrcOnMPS && !isDstOnMPS) { + return Kind::MPS_TO_CPU; + } else if (!isSrcOnMPS && isDstOnMPS) { + return Kind::CPU_TO_MPS; + } + return Kind::MPS_TO_MPS; + } +}; + +struct CopyStat : CopyInfo { + explicit CopyStat(std::string CopyKindStr) : + CopyInfo(nullptr, 0, 0, false, false), kindStr(std::move(CopyKindStr)) {} + // total number of copies + size_t totalCount = 0; + // number of Scalar copies (i.e., less than sizeof(int64)) + size_t scalarsCount = 0; + // number of blocking copies (i.e., require syncing to GPU) + size_t blockingCount = 0; + // number of copies that used memcpy(), instead of Metal Blit Encoder + size_t memcpyCount = 0; + // accumulated GPU time in ms for the scalar copies + std::atomic scalarsGpuTime{0.0}; + // copy kind in string type + std::string kindStr; +}; + +class MPSProfiler { +public: + // lower 16 bits used for profiler options + enum ProfileOptions : uint32_t { + OPTIONS_NONE = 0, + // ALL_* means, all signpost types (RUN_OPERATION|BLIT_COPY|CPU_FALLBACK, etc.) 
+ // (used for convenience to not compute bit flags by OR-ing manually) + // trace all signpost types using events + ALL_SIGNPOST_EVENTS = (1 << 0), + // trace all signpost types using intervals + ALL_SIGNPOST_INTERVALS = (1 << 1), + // always wait for command buffer to finish executing after each commit + WAIT_UNTIL_COMPLETED = (1 << 2), + // for interval-based signposts, include the scheduling portion of + // Graph/Kernel/Copy executions as well. + // if flag is disabled, only "GPU run time" is included in interval, + // and not schedule time. + INCLUDE_SCHEDULE_INTERVAL = (1 << 3), + + // use these if you need to trace signpost types individually (rarely required) + // trace signpost using intervals + USE_INTERVALS = (1 << 4), + // trace signpost by emitting events + USE_EVENTS = (1 << 5), + // used for sanity check (Change this when new option added) + OPTIONS_COUNT = (USE_EVENTS << 1) - 1, + }; + + // when adding new types, #define the type string in MPSProfiler.mm as well. + // upper 16 bits used for event types + enum SignpostTypes : uint32_t { + SIGNPOST_NONE = 0, + // trace signposts for PyTorch operation executions + RUN_OPERATION = (1 << 16), + // trace signposts for blitter copies + BLIT_COPY = (1 << 17), + // trace signposts for ops that fall back on CPU + CPU_FALLBACK = (1 << 18), + // used for sanity check (Change this when new type added) + SIGNPOST_COUNT = (CPU_FALLBACK << 1) - 1, + }; + + enum LogOptions : uint32_t { + LOG_NONE = 0, + + // Info logging options during execution + // ------------------------------------- + // prints operation info (id/key/run_count) during execution + OPERATION_INFO = (1 << 0), + // prints copy info (src/dst tensors/buffers, size, etc.)
during execution + COPY_INFO = (1 << 1), + // prints CPU Fallback info (id/runCount/opName/copyOverhead) during execution + CPU_FALLBACK_INFO = (1 << 2), + + // Profiling Statistics logging options when process terminates + // ------------------------------------------------------------ + // prints all stats (OPERATION_STATS, COPY_STATS, CPU_FALLBACK_STATS) before process terminates + // this is convenient to not combine following stats bit flags manually + ALL_STATS = (1 << 3), + // prints operation stats (GPU times, run count, etc.) before process terminates + OPERATION_STATS = (1 << 4), + // prints copies stats (GPU times, copy kinds, sizes, etc.) before process terminates + COPY_STATS = (1 << 5), + // prints CPU Fallback stats (CPU times, run times, size of MPS<->CPU copies + // for tensors, etc.) before process terminates + CPU_FALLBACK_STATS = (1 << 6), + + // Metadata format options when logging the info + // --------------------------------------------- + // if enabled, includes GPU run time in metadata (i.e., GPUEndTime-GPUStartTime + // from Metal Command Buffers) (e.g., [GPU=0.324 ms]) + INCLUDE_GPU_TIME = (1 << 7), + // if enabled, includes GPU scheduling time in metadata separately + // (i.e., KernelEndTime-KernelStartTime from Metal Command Buffers) + // e.g., [GPU=0.324 ms, KRNL=0.036 ms] + INCLUDE_KERNEL_TIME = (1 << 8), + // if enabled, includes the unique buffer ID in metadata for the storage + // of a tensor that was allocated on MPSAllocator. This is useful (along with + // the EV "PYTORCH_DEBUG_MPS_ALLOCATOR") to identify buffers that are involved + // with various operations. 
+ INCLUDE_BUFFER_ID = (1 << 9), + + // used for sanity check (Change this when new option added) + LOG_COUNT = (INCLUDE_BUFFER_ID << 1) - 1, + }; + + explicit MPSProfiler(); + ~MPSProfiler(); + + // the handle is either "MPSGraph*" or "id" for Metal Kernels + // the beginProfile*() functions return a profileId which is unique per graph/kernel/copy + uint64_t beginProfileKernel(const void* handle, const std::string& strKey, bool isGraph); + uint64_t beginProfileKernel(const void* handle, const std::string& kernelName, const TensorList& tensors); + uint64_t beginProfileCopy(const void* srcBuffer, const void* dstBuffer, + const OptionalTensorRef srcTensor, + const OptionalTensorRef dstTensor, + size_t length, bool isNonBlocking, bool usesBlitter = true); + uint64_t beginProfileCPUFallback(const std::string& opName, const TensorList& tensors); + void beginProfileGPUInterval(const void* handle); + + void endProfileCopy(uint64_t profileId, SyncType syncType); + void endProfileKernel(const void* handle, SyncType syncType = SyncType::NONE); + void endProfileCPUFallback(const std::string& opName); + + // these are used to hook into Python bindings for torch.mps.profiler module. + // this enables generating OS Signpost traces from MPSProfiler on-demand + // during runtime (instead of environment variables). + // The "mode" could be either "interval", "event", or both "interval,event" + // for interval-based and/or event-based signpost tracing. 
+ void StartTrace(const std::string& mode, bool waitUntilCompleted); + void StopTrace(); + + // Abstractions for GPU trace capturing + bool isCaptureEnabled() const; + bool isCapturing() const; + void startCapture(const std::string& name, MPSStream* stream = nullptr); + void stopCapture(MPSStream* stream = nullptr); + + // convenience functions to indicate whether signpost tracing or + // logging are enabled for the SignpostTypes + bool isOperationProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::RUN_OPERATION) || + (m_log_options & (LogOptions::OPERATION_INFO | LogOptions::OPERATION_STATS)); + } + bool isCopyProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::BLIT_COPY) || + (m_log_options & (LogOptions::COPY_INFO | LogOptions::COPY_STATS)); + } + bool isCPUFallbackProfilingEnabled() const { + return (m_signpost_types & SignpostTypes::CPU_FALLBACK) || + (m_log_options & (LogOptions::CPU_FALLBACK_INFO | LogOptions::CPU_FALLBACK_STATS)); + } + bool isSignpostTracingEnabled() const { + return (m_signpost_types != SignpostTypes::SIGNPOST_NONE); + } + + private: + // indicates what type of signpost types are enabled and traced by MPS profiler. + uint32_t m_signpost_types = 0; + uint32_t m_profile_options = 0; + uint32_t m_log_options = 0; + uint64_t m_kernel_counter = 0; + uint64_t m_graph_counter = 0; + uint64_t m_cpu_fb_counter = 0; + uint64_t m_copy_counter = 0; + // technically, it's possible to trace both events and intervals at the same time + // so we use separate os_log categories for them + os_log_t m_os_log_events; + os_log_t m_os_log_intervals; + // stats logging could run either from destructor or signal handler + // so this is used to check if logging has already started. + std::atomic_bool hasLoggedStats{false}; + // indicates there are pending completionHandler callbacks that haven't been called yet. 
+ std::atomic_bool hasPendingCompletionHandlers{false}; + // used to capture sigint signal to log profiling stats + static struct sigaction currentSigint, previousSigint; + + // We use the following lists for two reasons: + // 1- for interval-based signposts the "begin" point won't be in same function + // as the "end" point where we need to be able to retrieve signpost's info + // 2- if Operations info need to be logged when process ends using LogOptions::OPERATION_INFO. + + // the pointer key for this map is either "MPSGraph*" or "id" for Metal Kernels + // this list is retained and could be logged along with aggregate profiling numbers when the process ends. + std::unordered_map> m_op_info_list{}; + // the string key for this map is the op name that we fall back to execute on CPU + // this list is retained and could be logged along with aggregate profiling numbers when the process ends. + std::unordered_map> m_cpu_fb_info_list{}; + // this list contains the info for copies, and its key is the unique profileId + // which is generated from m_copy_counter + // The copyInfo list is not retained. 
+ std::unordered_map> m_copy_info_list{}; + // a short list that contains copy stats + std::unordered_map> m_copy_stat_list{}; + + mutable MTLCaptureManager *captureManager = nil; + unsigned captureCount = 0; + + void initialize(); + void beginProfileExecution(BaseInfo& info, bool cpuExecution = false); + void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id, + os_signpost_id_t interval_signpost_id, + double gpuTime, double schedulingTime); + void addProfilerScheduledHandler(BaseInfo& info); + void addProfilerCompletedHandler(BaseInfo& info, SyncType syncType); + void emitSignpostEvent(SignpostTypes signpost_type, os_signpost_id_t signpost_id, + const std::string& msg) const; + void beginSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id, + const std::string& msg) const; + void endSignpostInterval(SignpostTypes signpost_type, os_signpost_id_t signpost_id) const; + + void updateCopyStats(const CopyInfo& copyInfo, double gpuTime, double schedulingTime); + // returns true if logging the profiling info "during the execution" is enabled + bool isProfileInfoLoggingEnabled(BaseInfo::Type infoType, bool isExecutionEnded); + // logs all the profiling stats that are enabled + void logProfilingStats(); + // logs kernel profiling stats when the process ends. + void logOperationsProfilingStats(std::FILE* f) const; + // logs CPU Fallback profiling stats when the process ends. + void logCPUFallbackProfilingStats(std::FILE* f) const; + // logs copy profiling stats when the process ends. 
+ void logCopyProfilingStats(std::FILE* f) const; + + os_signpost_id_t generateSignpostId(os_signpost_type_t signpostType, const void* ptr = nullptr); + static SignpostTypes getSignpostType(BaseInfo::Type infoType); + static void handleIntSignal(int signal); +}; + +} // namespace Profiler + +Profiler::MPSProfiler& getMPSProfiler(); + +} // namespace at::mps diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSStream.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSStream.h new file mode 100644 index 0000000000000000000000000000000000000000..fbaa055109042da726dd7d54a34d0d2ed4015458 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/mps/MPSStream.h @@ -0,0 +1,133 @@ +// Copyright © 2022 Apple Inc. + +#pragma once + +#include +#include + +#include +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +#include +typedef id MTLCommandQueue_t; +typedef id MTLCommandBuffer_t; +typedef id MTLComputeCommandEncoder_t; +typedef id MTLSharedEvent_t; +typedef id MTLDevice_t; +#else +typedef void* MTLCommandQueue_t; +typedef void* MTLCommandQueue; +typedef void* MTLCommandBuffer_t; +typedef void* MTLCommandBuffer; +typedef void* MTLComputeCommandEncoder_t; +typedef void* MTLSharedEvent_t; +typedef void* dispatch_queue_t; +typedef void* MTLDevice_t; +#define nil NULL; +#endif + + +namespace at::mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +enum class SyncType { + NONE, // no commit to command buffer + COMMIT, // commit and flush the command buffer + COMMIT_AND_WAIT, // flush and wait for command buffer execution to finish + COMMIT_AND_CONTINUE,// commit and continue with a new underlying command buffer + COMMIT_ADAPTIVE, // commit adaptively based on available memory +}; + +class TORCH_API MPSStream +{ +public: + enum Unchecked { UNCHECKED }; + + /// Construct a MPSStream from a 
Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a MPS stream. + explicit MPSStream(Stream stream); + + ~MPSStream(); + MTLCommandQueue_t commandQueue() const { return _commandQueue; }; + dispatch_queue_t queue() const { return _serialQueue; } + + MPSCommandBuffer* commandBuffer(); + MTLComputeCommandEncoder_t commandEncoder(); + void endKernelCoalescing(); + void synchronize(SyncType syncType); + void fill(id buffer, uint8_t value, size_t length, size_t offset, SyncType syncType = SyncType::NONE); + void copy(id srcBuffer, id dstBuffer, + size_t length, size_t srcOffset, size_t dstOffset, + uint64_t profileId, SyncType syncType = SyncType::NONE); + void copy_and_sync(id srcBuffer, id dstBuffer, + size_t length, size_t srcOffset, size_t dstOffset, + bool non_blocking, uint64_t profileId); + void executeMPSGraph(MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results, SyncType syncType = SyncType::NONE); + void addCompletedHandler(MTLCommandBufferHandler block); + + /// Get the MPS device index that this stream is associated with. + c10::DeviceIndex device_index() const { return _stream.device_index(); } + + MTLCommandQueue_t stream() const { return _commandQueue; }; + + MTLDevice_t device() const { return [_commandQueue device];} + + /// Explicit conversion to Stream. 
+ Stream unwrap() const { return _stream; } + +private: + Stream _stream; + MTLCommandQueue_t _commandQueue = nil; + MPSCommandBuffer* _commandBuffer = nil; + MPSCommandBuffer* _prevCommandBuffer = nil; + MTLComputeCommandEncoder_t _commandEncoder = nil; + MPSGraphExecutionDescriptor *_executionDescriptor = nil; + MPSGraphCompilationDescriptor *_compilationDescriptor = nil; + dispatch_queue_t _serialQueue = nullptr; + // CommitAndContinue is enabled by default + bool _enableCommitAndContinue = true; + + // use synchronize() to access any of these commit functions outside MPSStream + void commit(); + void commitAndWait(); + void commitAndContinue(); + void flush(); +}; + +/** + * Get the current MPS stream + */ +TORCH_API MPSStream* getCurrentMPSStream(); + +/** + * Get the default MPS stream + */ +TORCH_API MPSStream* getDefaultMPSStream(); + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +class TORCH_API MPSStreamImpl +{ + public: + /** + * Gets single instance of the MPSStream. 
+ */ + static MPSStream* getInstance(); + + private: + static MPSStream* _stream; + MPSStreamImpl(); +}; + +} // namespace at::mps diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CatKernel.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CatKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5afa1add4da3f8fb4bf0a41e874e9f1c8b5de000 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/CatKernel.h @@ -0,0 +1,12 @@ +#pragma once + +#include +#include +#include + +namespace at::native { + +using cat_serial_fn = void(*)(const Tensor &, const MaterializedITensorListRef&, int64_t); +DECLARE_DISPATCH(cat_serial_fn, cat_serial_stub); + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Loops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Loops.h new file mode 100644 index 0000000000000000000000000000000000000000..a7a567aa915deaf2aae7d53e3f4cd76ad4f90d04 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/Loops.h @@ -0,0 +1,394 @@ +#pragma once + +// This file provides two functions to help write elementwise kernels: +// +// cpu_kernel(TensorIterator iter, ) +// cpu_kernel_vec(TensorIterator iter, , ) +// +// Both functions may generate vectorized code. The cpu_kernel implementation +// relies on the compiler's auto-vectorization. The cpu_kernel_vec +// implementation uses x86 SIMD intrinsics when available. These functions +// are only intended to be used in the ATen/native/cpu subdirectory, since files +// in other directories are not compiled with AVX/AVX2 enabled. See README.md +// for more details. 
+// +// For example, to write a multiplication kernel for float: +// +// cpu_kernel(iter, [](float a, float b) { return a * b; }); +// +// Or you may write: +// +// cpu_kernel_vec(iter, +// [](float a, float b) { return a * b; }, +// [](Vectorized a, Vectorized b) { return a * b; }); +// +// See BinaryOpsKernel.cpp for the complete implementation +// +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace at::native { inline namespace CPU_CAPABILITY { + +using namespace vec; + +template +typename traits::ArgsTuple +dereference_impl(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, + std::index_sequence) { + return std::make_tuple( + c10::load::type>( + data[INDEX] + i * strides[INDEX])...); +} + +template +typename traits::ArgsTuple +dereference(char* C10_RESTRICT data[], const int64_t* strides, int64_t i) { + using Indices = std::make_index_sequence; + return dereference_impl(data, strides, i, Indices{}); +} + +template +typename traits::ArgsTuple +dereference_vec_impl(char* C10_RESTRICT data[], + const typename traits::result_type& opt_scalar, + size_t S, + int64_t i, + std::index_sequence) { + using Vec = typename traits::result_type; + using scalar_t = typename Vec::value_type; + return std::make_tuple( + S == INDEX + 1 ? 
+ opt_scalar : + Vec::loadu(data[INDEX] + i * sizeof(scalar_t))...); +} + +template +typename traits::ArgsTuple +dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) { + using Indices = std::make_index_sequence; + return dereference_vec_impl(data, opt_scalar, S, i, Indices{}); +} + +template ::result_type>>* = nullptr> +inline void +execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + using result_type = typename traits::result_type; + for (; i < n; i++) { + result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); + *out_ptr = c10::guts::apply(op, dereference( + &data[1], + &strides[1], + i)); + } +} + +template ::result_type>>* = nullptr> +inline void +execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + for (; i < n; i++) { + c10::guts::apply(op, dereference( + &data[0], + &strides[0], + i)); + } +} + +// Basic loop operation (one output, N inputs). May be auto-vectorized +// by the compiler. Supports inputs and outputs of different types. +template +inline void +basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + constexpr int ntensors = traits::arity + 1; + + // Copying strides to temporary array helps auto vectorization in older GCC + // versions. 
+ int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = strides_[arg]; + } + + execute_op(data, strides, i, n, std::forward(op)); +} + +// the recursive variadic template for iterating over the returned tuple +template +struct TupleOutput { + static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i, + const T &tuple) { + TupleOutput::handle(data, strides, i, tuple); + + auto output = std::get(tuple); + using output_type = decltype(output); + output_type * out_ptr = (output_type *)(data[N - 1] + i * strides[N - 1]); + *out_ptr = output; + } +}; + +// Base case for the above recursive template +template +struct TupleOutput { + static void handle(char *C10_RESTRICT data[], const int64_t *strides, int64_t i, + const T &tuple) { + auto output = std::get<0>(tuple); + using output_type = decltype(output); + output_type* out_ptr = (output_type *)(data[0] + i * strides[0]); + *out_ptr = output; + } +}; + +template +void handle_tuple_outputs(char* C10_RESTRICT data[], + const int64_t* strides, + int64_t i, + const std::tuple &tuple) { + TupleOutput::handle(data, strides, i, tuple); +} + +// Loop operation for `cpu_kernel_multiple_outputs`. +// 1. Use `c10::guts::apply` to make dynamic method invocation +// for the lambda passed in `cpu_kernel_multiple_outputs`. +// 2. Iterate over the members of the returned tuple, set the corresponding +// output tensor by the tuple member in `handle_tuple_outputs` function. +template +inline void +multiple_outputs_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t&& op) { + using traits = function_traits; + + using result_type = typename traits::result_type; + constexpr int num_outputs = std::tuple_size::value; + constexpr int ntensors = traits::arity + num_outputs; + + // Copying strides to temporary array helps auto vectorization in older GCC + // versions. 
+ int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = strides_[arg]; + } + + for (; i < n; i++) { + auto output = c10::guts::apply(op, dereference( + &data[num_outputs], + &strides[num_outputs], + i)); + handle_tuple_outputs(data, strides, i, output); + } +} + +// Explicitly vectorized loop implementation. All inputs and outputs must be +// the same type and contiguous with one exception: a single input may be +// a scalar (stride 0). It's position is indicated by the argument `S`. If `S` +// is 0, then there are no scalar inputs. +template +inline void +vectorized_loop(char** C10_RESTRICT data_, int64_t n, int64_t S, func_t&& op, vec_func_t&& vop) { + using traits = function_traits; + using scalar_t = typename function_traits::result_type; + using Vec = Vectorized; + constexpr int ntensors = traits::arity + 1; + + char* C10_RESTRICT data[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + data[arg] = data_[arg]; + } + + Vec opt_scalar = Vec(S > 0 ? *(scalar_t*)data[S] : scalar_t(0)); + int64_t i = 0; + for (; i <= n - 2 * Vec::size(); i += 2 * Vec::size()) { + auto args1 = dereference_vec(&data[1], opt_scalar, S, i); + auto args2 = dereference_vec(&data[1], opt_scalar, S, i + Vec::size()); + auto out1 = c10::guts::apply(vop, std::move(args1)); + auto out2 = c10::guts::apply(vop, std::move(args2)); + out1.store(data[0] + i * sizeof(scalar_t)); + out2.store(data[0] + (i + Vec::size()) * sizeof(scalar_t)); + } + if (i < n) { + int64_t strides[ntensors]; + for (const auto arg : c10::irange(ntensors)) { + strides[arg] = (S > 0 && arg == S) ? 
0 : sizeof(scalar_t); + } + basic_loop(data, strides, i, n, std::forward(op)); + } +} + + +template +inline void unroll_contiguous_scalar_checks( + const int64_t* /*strides*/, + std::index_sequence<>, + cb_t&& cb) { + cb(0); +} + +template +inline void unroll_contiguous_scalar_checks( + const int64_t* strides, + std::index_sequence, + cb_t&& cb) { + if (is_contiguous_scalar(strides)) { + cb(INDEX0 + 1); + } else { + unroll_contiguous_scalar_checks(strides, std::index_sequence{}, std::forward(cb)); + } +} + +template +struct VectorizedLoop2d { + op_t op; + vop_t vop; + + using traits = function_traits; + static constexpr int ntensors = traits::arity + 1; + using data_t = std::array; + + VectorizedLoop2d(op_t op, vop_t vop): + op(std::move(op)), vop(std::move(vop)) {} + + static void advance(data_t &data, const int64_t *outer_strides) { + for (const auto arg : c10::irange(data.size())) { + data[arg] += outer_strides[arg]; + } + } + + void operator()(char** base, const int64_t *strides, int64_t size0, int64_t size1) { + data_t data; + std::copy_n(base, ntensors, data.data()); + const int64_t *outer_strides = &strides[ntensors]; + + if (is_contiguous(strides)) { + for (const auto i C10_UNUSED : c10::irange(size1)) { + vectorized_loop(data.data(), size0, 0, op, vop); + advance(data, outer_strides); + } + } else { + using Indices = std::make_index_sequence; + unroll_contiguous_scalar_checks(strides, Indices{}, [&](size_t idx) { + if (idx) { + for (const auto i C10_UNUSED : c10::irange(size1)) { + vectorized_loop(data.data(), size0, idx, op, vop); + advance(data, outer_strides); + } + } else { + for (const auto i C10_UNUSED : c10::irange(size1)) { + basic_loop(data.data(), strides, 0, size0, op); + advance(data, outer_strides); + } + } + }); + } + } +}; + +template +VectorizedLoop2d make_vectorized_loop2d( + op_t &&op, vop_t &&vop) { + return VectorizedLoop2d(std::forward(op), std::forward(vop)); +} + +template +void cpu_kernel(TensorIteratorBase& iter, func_t&& op, 
int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + // this could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.for_each([&](char** data, const int64_t* strides, int64_t n) { + // basic loop can handle 1d slices with arbitrary strides, and 1d slices is all that + // iter.for_each is ever sending to the loop lambda + basic_loop(data, strides, 0, n, op); + }, grain_size); + iter.cast_outputs(); +} + +// This function helps write elementwise kernels that requires multiple outputs. +// It follows the similar structure of cpu_kernel. +// Instead of `basic_loop` function, a new `multiple_outputs_loop` function is +// manipulated to handle multiple return values. +// For now `needs_dynamic_casting` check is not added as the passed lambda (`func_t`) +// of `multiple_outputs_loop` returns `std::tuple` instead of `scalar_t`. +// The `gpu_kernel_multiple_outputs` is also implemented without this check, +// We could extend `needs_dynamic_casting` to support both `std::tuple` and +// `thrust::tuple` in the future. 
+template +void cpu_kernel_multiple_outputs(TensorIteratorBase& iter, func_t&& op, int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + + iter.for_each([&](char** data, const int64_t* strides, int64_t n) { + multiple_outputs_loop(data, strides, 0, n, op); + }, grain_size); + iter.cast_outputs(); +} + +template +void cpu_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, int64_t grain_size = at::internal::GRAIN_SIZE) { + using traits = function_traits; + // this could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU, but some kernels (like Fill) + // explicitly dynamic_cast, so we give the opt-out of checking. + if constexpr (check_dynamic_cast) { + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + } + + iter.for_each(make_vectorized_loop2d(std::forward(op), std::forward(vop)), grain_size); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op, const Range& range) { + using traits = function_traits; + constexpr bool result_void = std::is_void_v; + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity && + ((result_void && iter.noutputs() == 0) || (!result_void && iter.noutputs() == 1))); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) { + basic_loop(data, strides, 0, n, op); + }, range); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel(TensorIteratorBase& iter, func_t&& op) { + cpu_serial_kernel(iter, std::forward(op), {0, iter.numel()}); +} + +template +void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop, const Range& range) { + using traits = function_traits; + // this 
could be extended to work with void return types + TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity); + TORCH_INTERNAL_ASSERT(iter.noutputs() == 1); + // dynamic casting not currently supported on CPU + TORCH_INTERNAL_ASSERT(!needs_dynamic_casting::check(iter)); + + iter.serial_for_each(make_vectorized_loop2d(std::forward(op), std::forward(vop)), range); + iter.cast_outputs(); +} + +template +void cpu_serial_kernel_vec(TensorIteratorBase& iter, func_t&& op, vec_func_t&& vop) { + cpu_serial_kernel_vec(iter, std::forward(op), std::forward(vop), {0, iter.numel()}); +} + +}} // namespace at::native:: diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..8c6424f8b0eac8a9b632666f89e8d5c8b5efd835 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/cpu/ReduceUtils.h @@ -0,0 +1,238 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { +inline namespace CPU_CAPABILITY { + +using namespace vec; + +#define AT_DISPATCH_REDUCTION_TYPES(op, ...) 
\ + [&] { \ + switch (op) { \ + case ReductionType::SUM: { \ + static constexpr auto reduce = ReductionType::SUM; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MEAN: { \ + static constexpr auto reduce = ReductionType::MEAN; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MIN: { \ + static constexpr auto reduce = ReductionType::MIN; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::MAX: { \ + static constexpr auto reduce = ReductionType::MAX; \ + return __VA_ARGS__(); \ + } \ + case ReductionType::PROD: { \ + static constexpr auto reduce = ReductionType::PROD; \ + return __VA_ARGS__(); \ + } \ + } \ + }() + +template +inline vec_scalar_t init_value() { + using acc_t = vec_scalar_t; + acc_t val; + if (reduce == ReductionType::SUM || + reduce == ReductionType::MEAN) { + val = static_cast(0); + } else if (reduce == ReductionType::PROD) { + val = static_cast(1); + } else if (reduce == ReductionType::MAX) { + val = -std::numeric_limits::infinity(); + } else { + TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN); + val = std::numeric_limits::infinity(); + } + return val; +} + +template +inline vec_scalar_t init_value(const std::optional& initial) { + using acc_t = vec_scalar_t; + if (initial.has_value()) { + return initial.value().to(); + } else { + return init_value(); + } +} + +template +inline void init(scalar_t* out, int64_t size, const vec_scalar_t& val) { + using Vec = Vectorized>; + map( + [val](Vec x) { return Vec(val); }, + out, + out, + size); +} + +template +inline void init(scalar_t* out, int64_t size, const std::optional& initial) { + using acc_t = vec_scalar_t; + acc_t val = init_value(initial); + init(out, size, val); +} + +// overload with `include_self`, used by scatter_reduce +template +inline void init(scalar_t* out, int64_t size, bool include_self = false) { + using acc_t = vec_scalar_t; + if (!include_self) { + acc_t val = init_value(); + init(out, size, val); + } +} + +template +inline void _init(scalar_t* self_ptr, 
at::opmath_type* buffer_ptr, int64_t size, bool include_self) { + if (!include_self) { + init, reduce>(buffer_ptr, size, include_self); + } else { + vec::convert(self_ptr, buffer_ptr, size); + } +} + +template +inline typename std::enable_if::value, scalar_t>::type +_max(const scalar_t& x, const scalar_t& y) { + return at::_isnan(y) ? y : std::max(x, y); +} + +template +inline Vectorized _max(const Vectorized& x, const Vectorized& y) { + // vec::maximum propagates NaN + return vec::maximum(x, y); +} + +template +inline typename std::enable_if::value, Vec2>::type +_max(const vec_t& x, const vec_t& y) { + // vec::maximum propagates NaN + return maximum(x, y); +} + +template +inline typename std::enable_if::value, scalar_t>::type +_min(const scalar_t& x, const scalar_t& y) { + return at::_isnan(y) ? y : std::min(x, y); +} + +template +inline Vectorized _min(const Vectorized& x, const Vectorized& y) { + // vec::minimum propagates NaN + return vec::minimum(x, y); +} + +template +inline typename std::enable_if::value, Vec2>::type +_min(const vec_t& x, const vec_t& y) { + // vec::minimum propagates NaN + return minimum(x, y); +} + +template , int> = 0> +inline void map_acc( + const Op& vec_fun, + accumut* output_data, + const accumut* input_data, + const scalar_t* input_data2, + int64_t size) { + using Vec = vec::Vectorized; + using aVec = vec::Vectorized; + int64_t d = 0; + constexpr int64_t kVecSize = Vec::size(); + constexpr int64_t kaVecSize = aVec::size(); + for (d = 0; d < size - (size % kVecSize); d += kVecSize) { + Vec data2_vec = Vec::loadu(input_data2 + d); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); + aVec input_vec0 = aVec::loadu(input_data + d); + aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize); + vec_fun(input_vec0, data2_avec0).store(output_data + d); + vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize); + } + if (size - d > 0) { + int64_t tail_size = size - d; + Vec data2_vec = Vec::loadu(input_data2 + d, 
tail_size); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); + if (tail_size > kaVecSize) { + aVec input_vec0 = aVec::loadu(input_data + d); + aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize, tail_size - kaVecSize); + vec_fun(input_vec0, data2_avec0).store(output_data + d); + vec_fun(input_vec1, data2_avec1).store(output_data + d + kaVecSize, tail_size - kaVecSize); + } else { + aVec input_vec0 = aVec::loadu(input_data + d, tail_size); + vec_fun(input_vec0, data2_avec0).store(output_data + d, tail_size); + } + } +} + +// for Max and Min, propagate NaN: +template +inline T update(const T& x, const T& y) { + if (reduce == ReductionType::SUM || + reduce == ReductionType::MEAN) { + return x + y; + } else if (reduce == ReductionType::PROD) { + return x * y; + } else if (reduce == ReductionType::MAX) { + return _max(x, y); + } else { + TORCH_INTERNAL_ASSERT(reduce == ReductionType::MIN); + return _min(x, y); + } +} + +template +inline void update(scalar_t* out, const scalar_t* data, int64_t K) { + using Vec = vec::Vectorized>; + map2( + [](Vec x, Vec y) { return update(x, y); }, + out, + out, + data, + K); +} + +template , int> = 0> +inline void update(at::opmath_type* out, const scalar_t* data, int64_t K) { + using opmath_t = at::opmath_type; + using Vec = vec::Vectorized; + map_acc( + [](Vec x, Vec y) { return update(x, y); }, + out, + out, + data, + K); +} + +template +inline void write(scalar_t* out, int64_t count, int64_t K) { + using Vec = vec::Vectorized>; + if (reduce == ReductionType::MEAN) { + if (count > 0) { + vec::map( + [count](Vec x) { return x / Vec(count); }, + out, + out, + K); + } + } +} + +} // namespace CPU_CAPABILITY +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h new file mode 100644 index 
0000000000000000000000000000000000000000..0abcdb87a9ca6f3b0029eaa787ac26947809fc69 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizer.h @@ -0,0 +1,130 @@ +#pragma once + +#include +#include +#include +#include + +namespace at { +namespace native { + +Tensor& quantize_tensor_per_tensor_affine( + const Tensor& rtensor, + Tensor& qtensor, + double scale, + int64_t zero_point); +Tensor& quantize_tensor_per_channel_affine( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + Tensor zero_points, + int64_t axis); + +Tensor& quantize_tensor_per_channel_float_qparams( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +Tensor& dequantize_tensor_per_tensor_affine( + const Tensor& qtensor, + Tensor& rtensor, + double scale, + int64_t zero_point); +Tensor& dequantize_tensor_per_channel_affine( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + Tensor zero_points, + int64_t axis); +Tensor& dequantize_tensor_per_channel_float_qparams( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, double scale, int64_t zero_point); + +using quantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& rtensor, + Tensor& qtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using dequantize_tensor_per_tensor_affine_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, double scale, int64_t zero_point); + +using dequantize_tensor_per_channel_affine_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t 
axis); + +using dequantize_tensor_per_channel_float_qparams_fn = void (*)( + const Tensor& qtensor, + Tensor& rtensor, + const Tensor& scales, + const Tensor& zero_points, + int64_t axis); + +using quantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& rtensor, Tensor& qtensor, float scale, float zero_point); + +using dequantize_tensor_per_tensor_affine_sub_byte_fn = + void (*)(const Tensor& qtensor, Tensor& rtensor, float scale, float zero_point); + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_fn, + quantize_tensor_per_tensor_affine_stub); +DECLARE_DISPATCH( + quantize_tensor_per_channel_affine_fn, + quantize_tensor_per_channel_affine_stub); +DECLARE_DISPATCH( + quantize_tensor_per_channel_float_qparams_fn, + quantize_tensor_per_channel_float_qparams_stub); + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_fn, + dequantize_tensor_per_tensor_affine_stub); +DECLARE_DISPATCH( + dequantize_tensor_per_channel_affine_fn, + dequantize_tensor_per_channel_affine_stub); +DECLARE_DISPATCH( + dequantize_tensor_per_channel_float_qparams_fn, + dequantize_tensor_per_channel_float_qparams_stub); + +DECLARE_DISPATCH( + quantize_tensor_per_tensor_affine_sub_byte_fn, + quantize_tensor_per_tensor_affine_sub_byte_stub); + +DECLARE_DISPATCH( + dequantize_tensor_per_tensor_affine_sub_byte_fn, + dequantize_tensor_per_tensor_affine_sub_byte_stub); + +template +TORCH_API Tensor quantize_tensor( + Tensor rtensor, + Tensor qtensor, + double scale, + int64_t zero_point); +template +TORCH_API Tensor dequantize_tensor( + Tensor qtensor, + Tensor rtensor, + double scale, + int64_t zero_point); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h new file mode 100644 index 0000000000000000000000000000000000000000..31526c3ec3c52057463cd00f0dd8556160d4d2df --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/AffineQuantizerBase.h @@ -0,0 +1,47 @@ +#pragma once +#include +#include + +namespace at { +namespace native { + +// Quantize a float value into a uint value given scale and zero_point +template +TORCH_API T quantize_val(double scale, int64_t zero_point, float value); +// TODO combine this with quantize_val once the numerics for ARM are aligned +// with it +template +T quantize_val_arm( + const float scale, + const int32_t zero_point, + const float value); +template +void quantize_vec( + double scale, + int64_t zero_point, + const float* src, + T* dst, + size_t count = 8); +template +TORCH_API float dequantize_val(double scale, int64_t zero_point, T value); +template +TORCH_API float dequantize_vec( + double scale, + int64_t zero_point, + const T* src, + float* dst, + size_t count = 8); +template +TORCH_API DST_T requantize_val(double, int64_t, double, int64_t, SRC_T src); + +// Given a multiplier and a zero_point, requantize int32_t computed values back +// to quantized values. 
See comment above +// make_per_tensor_affine_quantizer function for the usage of int64_t +template +TORCH_API DST_T +requantize_from_int(double multiplier, int64_t zero_point, int64_t src); + +int quantize_val_float_qparams(float scale, float zero_point, float value, int qmin, int qmax); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..6f8ff918c1d2f3e421922650161aaa41eda9545f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/ConvUtils.h @@ -0,0 +1,62 @@ +#pragma once +#include +#include + +namespace at::native::quantized { +namespace { +// MakeConvOutputShape used from both CPU and CUDA libraries +// and exporting symbol from torch_cpu would probably take more storage +// than duplicating implementation which likely be inlined away +template +at::SmallVector MakeConvOutputShape( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const torch::List& stride, + const torch::List& padding, + const torch::List& dilation); + +#if defined(USE_CUDA) || defined(USE_PYTORCH_QNNPACK) +template <> +at::SmallVector MakeConvOutputShape<2>( + int N, // mini-batch + int M, // output channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const at::List& dilation) { + const int H = input_image_shape[0]; + const int W = input_image_shape[1]; + const int64_t Y_H = + (H + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_W = + (W + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + return {N, M, Y_H, Y_W}; +} + +template <> +at::SmallVector MakeConvOutputShape<3>( + int N, // mini-batch + int M, // output 
channels + const std::array& input_image_shape, + const std::vector& kernel, + const at::List& stride, + const at::List& padding, + const torch::List& dilation) { + const int D = input_image_shape[0]; + const int H = input_image_shape[1]; + const int W = input_image_shape[2]; + const int64_t Y_D = + (D + 2 * padding[0] - dilation[0] * (kernel[0] - 1) - 1) / stride[0] + 1; + const int64_t Y_H = + (H + 2 * padding[1] - dilation[1] * (kernel[1] - 1) - 1) / stride[1] + 1; + const int64_t Y_W = + (W + 2 * padding[2] - dilation[2] * (kernel[2] - 1) - 1) / stride[2] + 1; + return {N, M, Y_D, Y_H, Y_W}; +} + +#endif +} // anonymous namespace +} // namespace at::native::quantized diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/Copy.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/Copy.h new file mode 100644 index 0000000000000000000000000000000000000000..d52c8ff0fb2c7f7f6eed17acceb660482144eef9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/Copy.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace at { +namespace native { + +Tensor& quantized_copy_from_float_(Tensor& self, const Tensor& src); +} +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h new file mode 100644 index 0000000000000000000000000000000000000000..1fb7cfbb0e721f83ba5a9194ad72ea98c97d997d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/FakeQuantAffine.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include + +namespace at { + +struct TensorIterator; + +namespace native { + +using fake_quant_tensor_cachemask_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + float sc, + int64_t z_point, + int64_t quant_min, + int64_t quant_max); + +using 
fake_quant_tensor_cachemask_tensor_qparams_fn = void (*)( + Tensor& output, + Tensor& mask, + const Tensor& input, + const Tensor& sc, + const Tensor& z_point, + const Tensor& fake_quant_enabled, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_learnable_grad_tensor_fn = void (*)( + TensorIterator& iter, + float scale, + float inv_scale, + int64_t zero_point, + int64_t quant_min, + int64_t quant_max, + float grad_factor); + +DECLARE_DISPATCH(fake_quant_tensor_cachemask_fn, fake_quant_tensor_cachemask_stub); +DECLARE_DISPATCH(fake_quant_tensor_cachemask_tensor_qparams_fn, fake_quant_tensor_cachemask_tensor_qparams_stub); +DECLARE_DISPATCH(fake_quant_learnable_grad_tensor_fn, fake_quant_grad_learnable_tensor_stub); + +using fake_quant_per_channel_fn = void (*)( + TensorIterator &iter, + int64_t quant_min, + int64_t quant_max); + +using fake_quant_per_channel_cachemask_fn = void (*)( + TensorIterator &iter, + TensorIterator &iter_mask, + int64_t quant_min, + int64_t quant_max); + +DECLARE_DISPATCH(fake_quant_per_channel_cachemask_fn, fake_quant_per_channel_cachemask_stub); + +using fake_quant_learnable_per_channel_fn = void (*)( + TensorIterator &iter, + int64_t quant_min, + int64_t quant_max, + float grad_factor); + +DECLARE_DISPATCH(fake_quant_learnable_per_channel_fn, fake_quant_grad_learnable_channel_stub); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h new file mode 100644 index 0000000000000000000000000000000000000000..0e240b5a8e9afc61f8828f4162f1b89c7ec06bb7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/IndexKernel.h @@ -0,0 +1,14 @@ +#pragma once +#include + +namespace at { +namespace native { +using masked_fill_kernel_quantized_fn = void(*)(TensorIterator& iter, const Scalar& value, double scale, int zero_point); +using 
index_put_kernel_quantized_fn = void(*)(TensorIterator& iter, IntArrayRef index_size, IntArrayRef index_stride, bool accumulate, double scale, int zero_point); + +DECLARE_DISPATCH(masked_fill_kernel_quantized_fn, masked_fill_kernel_quantized_stub); +DECLARE_DISPATCH(index_put_kernel_quantized_fn, index_put_kernel_quantized_stub); + + +} // native +} // at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..d73bc0adbc4ef953e0580585ab9261700374a45d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/PackedParams.h @@ -0,0 +1,147 @@ +#pragma once + +#include +#include + +struct LinearPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) = 0; + + // out variant of LinearPackedParamsBase::apply + virtual at::Tensor& apply_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual at::Tensor& apply_relu_out( + const at::Tensor& /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/, + at::Tensor& output) { + throw std::runtime_error( + "apply_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_output_fp32): + // input -> q* -> dq* -> linear* -> + // qweight -> dq* / + // + // After fusion: + // input -> 
quantized::linear_with_input_q_dq_qweight_dq_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // X: float32 Tensor, will be quantized to quint8 in the op + // W_prepack: packed qint8 quantized weight and bias + // Returns: + // Y: float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + throw std::runtime_error( + "apply_with_input_q_dq_qweight_dq_output_fp32 is not implemented for this packed " + "parameter type"); + return {}; + } + + // Corresponding pattern (the ops with `*` are part of the pattern that + // represents the computation of quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32): + // input -> q* -> dq* -> linear* -> relu* -> + // qweight -> dq* / + // + // After fusion: + // input -> quantized::linear_with_input_q_dq_qweight_dq_relu_output_fp32* -> + // qweight / + // + // Additional Note: the weight is packed as well + // Params: + // input: float32 Tensor, will be quantized to quint8 in the op + // Returns: + // float32 Tensor + virtual at::Tensor apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) { + throw std::runtime_error( + "apply_with_input_q_dq_qweight_dq_relu_output_fp32 is not implemented for this packed " + "parameter type"); + return {}; + } + + virtual at::Tensor apply_dynamic( + at::Tensor input, + bool reduce_range = false) = 0; + virtual at::Tensor apply_dynamic_relu( + at::Tensor input, + bool reduce_range = false) = 0; + + virtual at::Tensor& apply_dynamic_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + throw std::runtime_error( + "apply_dynamic_out is not implemented for this packed " + "parameter type"); + return output; + } + virtual at::Tensor& apply_dynamic_relu_out( + const at::Tensor& /* input */, + at::Tensor& output, + bool /* reduce_range */) { + 
throw std::runtime_error( + "apply_dynamic_relu_out is not implemented for this packed " + "parameter type"); + return output; + } + + virtual std::tuple> unpack() = 0; + + virtual std::optional bias() = 0; + + virtual void set_bias(std::optional /*bias*/) { + throw std::runtime_error( + "set_bias is not implemented for this packed " + "parameter type"); + } +}; + +template +struct ConvPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) = 0; + virtual at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) = 0; + + virtual std::tuple> unpack() = 0; + + virtual torch::List stride() const = 0; + virtual torch::List padding() const = 0; + virtual torch::List output_padding() const = 0; + virtual torch::List dilation() const = 0; + virtual int64_t groups() const = 0; + virtual bool transpose() const = 0; +}; diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h new file mode 100644 index 0000000000000000000000000000000000000000..cf86a13c139a1f429ecb2cc4918c04df9e4b3246 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/BinaryOps.h @@ -0,0 +1,8 @@ +#include + +namespace at { +namespace native { +TORCH_API Tensor +quantized_add(Tensor qa, Tensor qb, double scale, int64_t zero_point); +} +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h new file mode 100644 index 0000000000000000000000000000000000000000..e6f47d611a19f4bcf804b63f20fb06be9a2c1f44 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/EmbeddingPackedParams.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +struct EmbeddingPackedParamsBase : public torch::jit::CustomClassHolder { + virtual at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor embeddingbag_4bit( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) = 0; + + virtual at::Tensor unpack() = 0; + + virtual int64_t bit_rate() const = 0; + virtual int64_t version() const = 0; +}; diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..800308544a5ec595c0b08480a5907c8d7fac1ec0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/OnednnUtils.h @@ -0,0 +1,457 @@ +#pragma once + +#include +#if AT_MKLDNN_ENABLED() +#include +#include +#include +#include + +#include + +using PrimitiveCacheKey = std::tuple< + double, // input_scale + int64_t, // input_zero_point + std::vector, // input_shape + double, // output_scale + int64_t, // output_zero_point + int64_t, // OMP_number_of_threads + double, // accum_scale + int64_t>; // accum_zero_point + +enum CacheKeyIndex { + InputScale, + InputZeroPoint, + InputShape, + OutputScale, + OutputZeroPoint, + NumOfThreads, +}; + +// Base class of primitive cache +struct PrimitiveCache { + PrimitiveCacheKey key; + + bool hit(const PrimitiveCacheKey& key) { + return this->key == key; 
+ } +}; + +using LinearParams = ideep::matmul_forward_params; +using Conv = dnnl::convolution_forward; +using ConvDesc = dnnl::convolution_forward::primitive_desc; +using ConvParams = ideep::convolution_forward_params; +using Deconv = dnnl::deconvolution_forward; +using DeconvDesc = dnnl::deconvolution_forward::primitive_desc; +using DeconvParams = ideep::deconv_forward_params; + +struct LinearPrimitiveCache : PrimitiveCache { + LinearPrimitiveCache() {} + + LinearPrimitiveCache( + const PrimitiveCacheKey& key, + const LinearParams& param) { + this->key = key; + this->param = param; + } + + LinearParams param; + + // For dynamic qlinear, scale and zero point + // are set at execution time. So we only need to compare + // the rest part of key. + bool hit_dynamic(const PrimitiveCacheKey& new_key) { + auto cached_input_shape = std::get(this->key); + auto new_input_shape = std::get(new_key); + return ( + cached_input_shape == new_input_shape && + std::get(this->key) == std::get(new_key)); + } + + LinearParams& get_param() { + return param; + } +}; + +struct ConvPrimitiveCache : PrimitiveCache { + ConvPrimitiveCache() {} + + ConvPrimitiveCache( + const PrimitiveCacheKey& key, + const ConvParams& params) { + this->key = key; + this->params = params; + } + + ConvParams params; + + ConvParams& get_params() { + return params; + } +}; + +struct DeconvPrimitiveCache : PrimitiveCache { + DeconvPrimitiveCache() {} + + DeconvPrimitiveCache( + const PrimitiveCacheKey& key, + const DeconvParams& params) { + this->key = key; + this->params = params; + } + + DeconvParams params; + + DeconvParams& get_params() { + return params; + } +}; + +enum PostOps { + NoPostOp, + Relu, + LeakyRelu, + Tanh, + Gelu +}; + + +struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { + PackedLinearWeightsOnednn( + std::unique_ptr weight, + std::optional bias, + at::Tensor orig_weight, + std::optional orig_bias) + : weight_(std::move(weight)), + bias_(std::move(bias)), + 
orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)) { + cache_initialized_flag = std::make_unique(); + } + std::unique_ptr weight_; + std::optional bias_; + at::Tensor orig_weight_; + std::optional orig_bias_; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + at::Tensor apply_leaky_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point, + double negative_slope); + + at::Tensor apply_tanh( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + std::tuple> unpack() override; + + std::optional bias() override { + return orig_bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + private: + LinearPrimitiveCache prim_cache; + std::unique_ptr cache_initialized_flag; + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point, + torch::List post_op_args = torch::List()); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range=false); + + LinearPrimitiveCache& get_cache() { + return prim_cache; + } +}; + +template +struct PackedConvWeightsOnednn : public ConvPackedParamsBase { + PackedConvWeightsOnednn( + std::unique_ptr weight, + std::optional bias, + at::Tensor orig_weight, + std::optional orig_bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose) + : weight_(std::move(weight)), + bias_(std::move(bias)), + orig_weight_(std::move(orig_weight)), + orig_bias_(std::move(orig_bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + 
output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose) { + cache_initialized_flag = std::make_unique(); + } + + std::unique_ptr weight_; + std::optional bias_; + at::Tensor orig_weight_; + std::optional orig_bias_; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + at::Tensor apply_add( + const at::Tensor& input, + const at::Tensor& accum, + double output_scale, + int64_t output_zero_point); + + at::Tensor apply_add_relu( + const at::Tensor& input, + const at::Tensor& accum, + double output_scale, + int64_t output_zero_point); + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + ConvPrimitiveCache conv_prim_cache; + DeconvPrimitiveCache deconv_prim_cache; + std::unique_ptr cache_initialized_flag; + + template + at::Tensor apply_impl( + const at::Tensor& input, + const std::optional& accum, + double output_scale, + int64_t output_zero_point); + + ConvPrimitiveCache& get_conv_cache() { + 
assert(!transpose()); + return conv_prim_cache; + } + + DeconvPrimitiveCache& get_deconv_cache() { + assert(transpose()); + return deconv_prim_cache; + } +}; + +namespace onednn_utils { + +inline ideep::attr_t create_attr_by_post_op( + const c10::string_view& binary_post_op, + double binary_alpha, + double input1_scale, + int64_t input1_zero_point, + const ideep::tensor::desc& input1_desc, + const c10::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const c10::string_view& unary_post_op_algorithm) { + using ideep::tensor; + if (binary_post_op == "none") { + if (unary_post_op == "relu") { + return ideep::attr_t::fuse_relu(); + } else if (unary_post_op == "leaky_relu") { + TORCH_CHECK( + unary_post_op_args.size() == 1, + "onednn qlinear: expect one argument for post op leaky_relu but got ", unary_post_op_args.size(), " args"); + auto alpha = unary_post_op_args[0].value().to(); + return ideep::attr_t::fuse_relu_v2(alpha); + } else if (unary_post_op == "tanh") { + return ideep::attr_t::fuse_tanh(); + } else if (unary_post_op == "gelu") { + TORCH_CHECK( + unary_post_op_algorithm == "none" || unary_post_op_algorithm == "tanh", + "onednn qlinear: algorithm for post op gelu must be none or tanh but got ", unary_post_op_algorithm); + auto post_algorithm = unary_post_op_algorithm == "none" ? 
+ dnnl::algorithm::eltwise_gelu_erf : + dnnl::algorithm::eltwise_gelu_tanh; + return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm); + } else if (unary_post_op == "hardtanh") { + TORCH_CHECK( + unary_post_op_args.size() == 2 && + unary_post_op_args[0].has_value() && + unary_post_op_args[1].has_value(), + "hardtanh is expected to have two scalar input: min_val and max_val"); + auto lower_bound_value = + unary_post_op_args[0].value().to(); + auto upper_bound_value = + unary_post_op_args[1].value().to(); + return ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); + } else if (unary_post_op == "hardswish") { + return ideep::attr_t::fuse_hardswish(); + } else if (unary_post_op == "swish") { + return ideep::attr_t::fuse_swish(); + } else { + TORCH_CHECK( + unary_post_op == "none", + "onednn qlinear: unsupported unary post op ", unary_post_op); + } + } else if (binary_post_op == "sum") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_sum(input1_scale, input1_zero_point); + } else if (unary_post_op == "relu") { + return ideep::attr_t::residual_with_sum_zero_point(input1_scale, input1_zero_point); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum"); + } + } else if (binary_post_op == "add") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_binary(ideep::algorithm::binary_add, input1_desc); + } else if (unary_post_op == "relu") { + ideep::post_ops po; + po.append_binary(ideep::algorithm::binary_add, input1_desc); + po.append_eltwise(ideep::algorithm::eltwise_relu, 0, 0); + return ideep::attr_t::attr_post_ops(po); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op add"); + } + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported binary post op ", binary_post_op); + } + return ideep::attr_t(); +} + +// ONEDNN requires symmetric quantization of weight +// Use this util function 
to check. +inline bool is_weight_symmetric_quant( + const at::Tensor& weight, + bool is_transposed_conv) { + bool is_symmetric = true; + const auto qtype = weight.qscheme(); + if (qtype == c10::kPerTensorAffine) { + is_symmetric &= (weight.q_zero_point() == 0); + } else if (qtype == c10::kPerChannelAffine) { + if (is_transposed_conv) { + // This case is currently not supported in PyTorch + // but we do not want to raise an error in this util function. + is_symmetric = false; + } else { + auto output_channels = weight.size(0); + for (int i = 0; i < output_channels; ++i) { + auto zp = weight.q_per_channel_zero_points()[i].item(); + is_symmetric &= (zp == 0); + } + } + } else { + // This case is currently not supported in PyTorch + // but we do not want to raise an error in this util function. + is_symmetric = false; + } + return is_symmetric; +} + +// When qengine is x86, use this util func to check if onednn kernel +// is preferred than fbgemm's to get better performance. +inline bool should_use_onednn_quant( + const at::Tensor& weight, + bool is_transposed_conv, + int groups, + torch::List output_padding) { + // Performance of onednn is only validated on Linux right now. + // Also, the heuristics for dispatching are based on perf data on Linux. + // So, for x86 qengine, we always use fbgemm kernels if OS is not Linux. + // TODO Support more OSs. 
+#if !defined(__linux__) + return false; +#else + bool vnni_available = cpuinfo_has_x86_avx512vnni(); + bool w_sym_quant = + is_weight_symmetric_quant(weight, is_transposed_conv); + bool opad_all_zero = + std::all_of(output_padding.begin(), output_padding.end(), [](int i) { return i==0; }); + return vnni_available && (groups <= 100) && w_sym_quant && opad_all_zero; +#endif +} + +} // onednn_utils + +at::Tensor _qconv_prepack_onednn( + at::Tensor weight, // from CPU backend instead of QuantizedCPU + at::Tensor weight_scales, // Weight zero points must be 0 for onednn + double input_scale, + int64_t input_zero_point, + torch::List stride, + torch::List padding, + torch::List dilation, + int64_t groups, + std::optional> input_shape=std::nullopt); + +#endif // #if AT_MKLDNN_ENABLED() diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..b217c757740b30764c0b4c2b278d4fb0d1829fc6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QnnpackUtils.h @@ -0,0 +1,527 @@ +#pragma once + +#ifdef USE_PYTORCH_QNNPACK +#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + +#include +inline int kPaddingChannels = 8; +struct QnnpackOperatorDeleter { + void operator()(pytorch_qnnp_operator_t op) { + pytorch_qnnp_delete_operator(op); + } +}; + +// PackedWeight struct for QNNPACK stores the original Weight and Bias as +// QNNPACK currently does not support an unpack function. +// For PyTorch Mobile, once the model is scripted and serialized we don't need +// to call unpack, so we can save some memory by checking for this case and free +// the original weights after packing. +// Input scale is set to null in pre-pack step. 
QNNPACK needs bias quantized +// with input scale which is available at runtime in pytorch. During runtime if +// input scale value changes then we requantize bias with the updated scale. For +// inference we expect the graph to be static so the input scale should not +// change across consecutive inference calls. +struct PackedLinearWeightsQnnp : public LinearPackedParamsBase { + PackedLinearWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + std::optional input_scale, + at::Tensor w_scales, + std::vector&& w_zps) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias_(at::native::mobile::allocate_padded_contiguous_if_needed( + bias, bias.suggest_memory_format())), + per_channel_(this->orig_weight.qscheme() == at::kPerChannelAffine), + input_scale(std::move(input_scale)), + w_scales(std::move(w_scales)), + w_zero_points(std::move(w_zps)), + q_scheme(this->orig_weight.qscheme()) { + weight_sizes = this->orig_weight.sizes().vec(); + } + + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias_; + bool per_channel_; + std::optional input_scale; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + std::vector weight_sizes; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range=false) override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range=false) override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + bool per_channel() const { + return per_channel_; + } + + private: + std::mutex qnnp_mutex_; + +#ifdef USE_XNNPACK + xnnpack_operator xnnp_linear_op; + + template + at::Tensor apply_impl_xnnp( 
+ const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK + + template + at::Tensor apply_impl( + at::Tensor input, + double output_scale, + int64_t output_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range); +}; + +template +struct PackedConvWeightsQnnp : public ConvPackedParamsBase { + PackedConvWeightsQnnp( + std::unique_ptr w, + at::Tensor orig_weight, + at::Tensor bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose, + std::optional input_scale, + std::vector kernel, + at::Tensor w_scale, + std::vector&& w_zps, + bool is_per_channel) + : w(std::move(w)), + orig_weight(std::move(orig_weight)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + is_per_channel_(is_per_channel), + input_scale(input_scale), + kernel_(std::move(kernel)), + w_scales(std::move(w_scale)), + w_zero_points(std::move(w_zps)) { + const bool any_padding = std::any_of( + padding_.begin(), padding_.end(), [](const auto& e) { return e != 0; }); + const size_t kernel_size = + std::accumulate(kernel_.begin(), kernel_.end(), 1, std::multiplies<>()); + + const size_t group_input_channels = transpose + ? this->orig_weight.size(0) / groups + : this->orig_weight.size(1); + const size_t group_output_channels = transpose + ? this->orig_weight.size(1) + : this->orig_weight.size(0) / groups; + + const size_t kernel_depth = kSpatialDim == 3 ? 
kernel_[0] : 1; + const size_t kernel_height = kernel_[kSpatialDim - 2]; + const size_t kernel_width = kernel_[kSpatialDim - 1]; + + pytorch_qnnp_ukernel_type ukernel_type; + if (transpose_) { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_none; + + const bool has_depthwise_dimensions = + (kSpatialDim == 2 && + ((kernel_height == 3 && kernel_width == 3) || + (kernel_height == 5 && kernel_width == 5))) || + (kSpatialDim == 3 && kernel_height == 3 && kernel_width == 3 && + kernel_depth == 3); + const bool has_depthwise_grouping = + group_input_channels == 1 && group_output_channels == 1 && groups > 1; + + if (has_depthwise_dimensions && has_depthwise_grouping) { + ukernel_type = pytorch_qnnp_ukernel_type_dwconv; + } else if ( + kernel_size == 1 && + std::all_of( + stride_.begin(), + stride_.end(), + [](const auto& e) { return e == 1; }) && + !any_padding) { + ukernel_type = group_input_channels >= SIZE_MAX + ? pytorch_qnnp_ukernel_type_xzp_gemm + : pytorch_qnnp_ukernel_type_gemm; + } else { + ukernel_type = pytorch_qnnp_ukernel_type_conv; + } + } + + if (is_per_channel && ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) { + TORCH_INTERNAL_ASSERT( + false, "Per channel quantized weights are not supported for XZP kernels"); + } + + pytorch_qnnp_operator_t convolution{nullptr}; + // Initially all the params are set to zero. 
+ convolution = static_cast( + calloc(1, sizeof(struct pytorch_qnnp_operator))); + if (convolution == nullptr) { + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for pytorch_qnnp_operator structure", + sizeof(struct pytorch_qnnp_operator)); + } + + convolution_op = + std::unique_ptr( + convolution); + + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + convolution->ukernel_type = ukernel_type; + convolution->groups = groups; + convolution->group_input_channels = group_input_channels; + convolution->group_output_channels = group_output_channels; + convolution->kernel_depth = kernel_depth; + convolution->kernel_height = kernel_height; + convolution->kernel_width = kernel_width; + convolution->stride_depth = kSpatialDim == 3 ? stride_[0] : 1; + convolution->stride_height = stride_[kSpatialDim - 2]; + convolution->stride_width = stride_[kSpatialDim - 1]; + convolution->dilation_depth = kSpatialDim == 3 ? dilation_[0] : 1; + convolution->dilation_height = dilation_[kSpatialDim - 2]; + convolution->dilation_width = dilation_[kSpatialDim - 1]; + convolution->input_padding_height = padding_[kSpatialDim - 2]; + convolution->input_padding_width = padding_[kSpatialDim - 1]; + convolution->input_padding_depth = kSpatialDim == 3 ? 
padding_[0] : 0; + convolution->per_channel = is_per_channel_; + convolution->transpose = transpose_; + + const uint32_t kr = pytorch_qnnp_params.q8conv.kr; + const size_t k_stride = (group_input_channels + (kr - 1)) & -kr; + + size_t zero_size = sizeof(uint8_t) * k_stride; + size_t zero_offset = 0; + + if (transpose_) { + convolution->adjustment_width = output_padding_[1]; + convolution->adjustment_height = output_padding_[0]; + if (group_input_channels < 8) { + zero_size += 8; + zero_offset = 8; + } + } else { + zero_buffer_size = 0; + if (any_padding) { + zero_size = 0; + zero_offset = 0; + if (ukernel_type == pytorch_qnnp_ukernel_type_dwconv) { + const uint32_t cr = pytorch_qnnp_params.q8dw9.cr; + const size_t group_stride = (groups + (cr - 1)) & -cr; + if (groups >= 8) { + zero_size = sizeof(uint8_t) * group_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * group_stride + 8; + zero_offset = sizeof(uint8_t) * 8; + } + } else if ( + ukernel_type == pytorch_qnnp_ukernel_type_conv || + ukernel_type == pytorch_qnnp_ukernel_type_gemm) { + if (group_input_channels >= 8) { + zero_size = sizeof(uint8_t) * k_stride; + zero_offset = 0; + } else { + zero_size = sizeof(uint8_t) * k_stride + 8; + zero_offset = 8; + } + } + } + } + + // NOLINTNEXTLINE(clang-analyzer-optin.portability.UnixAPI) + void* zero_buffer = malloc(zero_size); + if (zero_buffer == nullptr) { + pytorch_qnnp_delete_operator(convolution); + TORCH_INTERNAL_ASSERT( + false, "failed to allocate %zu bytes for zero padding", + zero_size); + } + // Need to set to input zero point + // memset(zero_buffer, input_zero_point, zero_size); + zero_buffer_size = zero_size; + convolution->zero_buffer = zero_buffer; + convolution->zero_pointer = (void*)((uintptr_t)zero_buffer + zero_offset); + } + + std::unique_ptr convolution_op; + #ifdef USE_XNNPACK + xnnpack_operator xnnp_convolution_op; + #endif // USE_XNNPACK + std::unique_ptr w; + at::Tensor orig_weight; + at::Tensor bias; + torch::List stride_; 
+ torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + bool transpose_; + bool is_per_channel_; + std::optional input_scale; + std::vector kernel_; + at::Tensor w_scales; + std::vector w_zero_points; + std::vector requantization_scales; + size_t zero_buffer_size; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range=false) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return transpose_; + } + + bool per_channel() const { + return is_per_channel_; + } + + private: + std::mutex qnnp_mutex_; + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); + +#ifdef USE_XNNPACK + template + at::Tensor apply_impl_xnnp( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +#endif // USE_XNNPACK +}; + +enum class Activation : uint8_t { NONE = 0, RELU = 1 }; + +#if defined(__ANDROID__) && !defined(__NDK_MAJOR__) +template +inline float Round(const float x) { + return ::nearbyintf(x); +} +inline double Round(const double x) { + return ::nearbyint(x); +} +#else +template +inline T Round(const T x) { + return 
std::nearbyint(x); +} +#endif + +template +inline T QuantizeValue(float scale, int32_t zero_point, float value) { + const int32_t qmin = std::numeric_limits::min(); + const int32_t qmax = std::numeric_limits::max(); + auto r = zero_point + static_cast(Round(value / scale)); + r = std::max(r, qmin); + r = std::min(r, qmax); + return static_cast(r); +} + +template +inline std::pair activationLimits( + float scale, + int32_t zero_point, + Activation Ac) { + switch (Ac) { + case Activation::NONE: + return {std::numeric_limits::min(), + std::numeric_limits::max()}; + case Activation::RELU: + return {QuantizeValue(scale, zero_point, 0.0), + std::numeric_limits::max()}; + default: +#ifdef _MSC_VER + __assume(0); +#else + __builtin_unreachable(); +#endif + } +} + +namespace at { +namespace native { +namespace qnnp_avgpool_helper { +Tensor qnnpack_avg_pool2d( + Tensor input, + IntArrayRef kernel_size, + IntArrayRef stride, + IntArrayRef padding, + bool ceil_mode, + bool count_include_pad, + std::optional divisor_override); +} // qnnp_avgpool_helper +} // namespace native +} // namespace at + +namespace { +C10_UNUSED std::vector generate_requantization_scales( + const at::Tensor& weight_scales, + const float input_scale, + const float output_scale, + std::vector& requant_scales) { + // Since weight scale is allocated with padding + // weight_scales.numel() gives us padded num elements. 
+ const auto num_output_channels_padded = weight_scales.numel(); + float *const weight_scales_data = weight_scales.data_ptr(); + if (static_cast(requant_scales.size()) < num_output_channels_padded) { + requant_scales.resize(num_output_channels_padded); + } + for (const auto i : c10::irange(num_output_channels_padded)) { + const auto inverse_output_scale = 1.f /output_scale; + requant_scales[i] = (weight_scales_data[i] * input_scale) * inverse_output_scale; + TORCH_CHECK( + (requant_scales[i] > 0.0f && std::isnormal(requant_scales[i])), + "failed to create op with requantization scale: ", + requant_scales[i], + ": requantization scale must be finite and positive"); + } + return requant_scales; +} + +C10_UNUSED std::pair, at::Tensor> make_zero_points_and_scales_tensor( + const at::Tensor& weight_contig, + bool transpose = false, + uint32_t groups = 1 + ) { + const int out_ch_idx = transpose ? 1 : 0; + const auto num_output_channels = weight_contig.size(out_ch_idx) * (transpose ? groups : 1); + // Add 8 to account for bufferring needed by QNNPACK. + const auto num_output_channels_padded = num_output_channels + kPaddingChannels; + const auto qtype = weight_contig.qscheme(); + std::vector weight_zp(num_output_channels_padded, 0); + // Adjust weight zero point, similar to weight data. 
+ if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(weight_contig.q_zero_point() + 128); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_zero_points().scalar_type() == at::kLong, + "Per channel zero points dtype must be long int."); + const int64_t* per_channel_zero_points = + weight_contig.q_per_channel_zero_points().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_zp[i] = (uint8_t)(per_channel_zero_points[i] + 128); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + at:: Tensor weight_scales = + at::empty( + {num_output_channels_padded}, + at::device(at::kCPU).dtype(at::kFloat)); + float *const weight_scales_data = weight_scales.data_ptr(); + if (qtype == at::kPerTensorAffine) { + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = weight_contig.q_scale(); + } + } else if (qtype == at::kPerChannelAffine) { + TORCH_CHECK( + weight_contig.q_per_channel_scales().scalar_type() == at::kDouble, + "Per channel scales dtype must be double."); + const double *const per_channel_scales = + weight_contig.q_per_channel_scales().data_ptr(); + for (const auto i : c10::irange(num_output_channels)) { + weight_scales_data[i] = static_cast(per_channel_scales[i]); + } + } else { + TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme."); + } + for (const auto i : c10::irange(num_output_channels, num_output_channels_padded)) { + weight_scales_data[i] = 1.f; + } + return {weight_zp, weight_scales}; +} +} // namespace + +#endif diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..0b026c739786a0b68ccf779f2724c1c4607998e1 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantUtils.h @@ -0,0 +1,239 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +namespace quant_utils { +namespace { + float RawUint16ToFp16(unsigned short value) { + // Convert raw 16 bits half precision floating point number + // to single precision floating point number. + const unsigned short sign_bits = value >> 15; + const unsigned short exponent_bits = value >> 10 & 0x1f; + const unsigned short significand_bits = value & 0x3ff; + + const float sign = sign_bits ? -1 : 1; + const float significand = + 1 + significand_bits * 0.0009765625f; // 0.0009765625f = 0x1p-10 = 2^-10; + const float exponent = exponent_bits - 0xf; + + return sign * std::ldexp(significand, exponent); +} + +template +bool CheckAndSaturate(T max_val, T* element) { + if (*element > max_val) { + *element = max_val; + return true; + } + if (*element < -max_val) { + *element = -max_val; + return true; + } + return false; +} +} +using namespace std; +// A structure to hold quantization parameters 'scale' and 'zero_point'. +// The meaning of these values is as the constants in the quantization equation +// +// real_value = scale * (quantized_value - zero_point) +// +// In other words, 'zero_point' is the quantized value that corresponds +// to the real value 0, and 'scale' is the difference of real values +// corresponding to consecutive quantized values. +struct TensorQuantizationParams { + double scale; + std::int32_t zero_point; + int precision; +}; + +// Use fp16_min as the small scale cutoff because we don't want to use scales in +// fp16 subnormal range. This is to be consistent with Glow and FakeLowP +// implementation for NNPI. 
+constexpr float SMALL_SCALE_THRESHOLD = 6.1e-5f; + +// Following implementation should be identical to fbgemm::ChooseQuantizationParams +inline TensorQuantizationParams ChooseQuantizationParams( + float min, + float max, + int32_t qmin, + int32_t qmax, + bool preserve_sparsity = false, + bool force_scale_power_of_two = false, + bool reduce_range = false) { + TORCH_CHECK( + min <= max, + "In ChooseQuantizationParams, min should be less than or equal to max"); + + if (reduce_range) { + qmin = qmin/2; + qmax = qmax/2; + } + if (min < 0 && max > 0 && preserve_sparsity) { + int symmetric_qmin = -((qmax - qmin) / 2 + 1); + int symmetric_qmax = (qmax - qmin) / 2; + double max_scale = + std::max(fabs(min / symmetric_qmin), fabs(max / symmetric_qmax)); + min = max_scale * symmetric_qmin; + max = max_scale * symmetric_qmax; + } + + // We extend the [min, max] interval to ensure that it contains 0. + // Otherwise, we would not meet the requirement that 0 be an exactly + // representable value. + min = std::min(min, 0.f); + max = std::max(max, 0.f); + + TORCH_CHECK( + qmin < qmax, + "In ChooseQuantizationParams, qmin should be less than qmax"); + + // Use double precision for intermediate computation but use single precision + // in final number to reflect the actual number used during quantization. + double scale = (static_cast(max) - min) / (qmax - qmin); + // If scale is 0 or too small so its reciprocal is infinity, we arbitrary + // adjust the scale to 0.1 . We want to avoid scale's reciprocal being + // infinity because some of fbgemm code pre-computes scale's reciprocal to do + // multiplication instead of division in the time critical part of code. 
+ if (float(scale) == 0.0f || std::isinf(1.0f / float(scale))) { + scale = 0.1; + } + TORCH_CHECK(scale > 0, "quantization scale should be > 0"); + + if (force_scale_power_of_two) { + if (scale < 1) { + scale = 1.0 / (1 << static_cast(floor(log(1.0 / scale) / log(2)))); + } else { + scale = 1 << static_cast(ceil(log(scale) / log(2))); + } + } + + // Cut off small scale + if (scale < SMALL_SCALE_THRESHOLD) { + float org_scale = scale; + scale = SMALL_SCALE_THRESHOLD; + // Adjust the min and max based on the new scale + if (min == 0.0f) { + max = SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else if (max == 0.0f) { + min = -SMALL_SCALE_THRESHOLD * (qmax - qmin); + } else { + float amplifier = SMALL_SCALE_THRESHOLD / org_scale; + min *= amplifier; + max *= amplifier; + } + } + + // Zero-point computation. + // First the initial floating-point computation. The zero-point can be + // determined from solving an affine equation for any known pair + // (real value, corresponding quantized value). + // We know two such pairs: (rmin, qmin) and (rmax, qmax). + // The arithmetic error on the zero point computed from either pair + // will be roughly machine_epsilon * (sum of absolute values of terms) + // so we want to use the variant that adds the smaller terms. + double zero_point_from_min = qmin - min / static_cast(scale); + double zero_point_from_max = qmax - max / static_cast(scale); + double zero_point_from_min_error = + std::abs(qmin) - std::abs(min / static_cast(scale)); + double zero_point_from_max_error = + std::abs(qmax) - std::abs(max / static_cast(scale)); + double initial_zero_point = + zero_point_from_min_error < zero_point_from_max_error + ? zero_point_from_min + : zero_point_from_max; + + // for symmetric quantization (preserve_sparsity == true), we force zero_point + // to be a middle value between qmin and qmax. + // If either min or max is 0, then we just use 0 as zero_point. 
+ if (min < 0 && max > 0 && preserve_sparsity) { + initial_zero_point = static_cast(qmin + qmax) / 2; + } + + // Now we need to nudge the zero point to be an integer + // (our zero points are integer, and this is motivated by the requirement + // to be able to represent the real value "0" exactly as a quantized value, + // which is required in multiple places, for example in Im2col with zero + // padding). + int32_t nudged_zero_point = 0; + if (initial_zero_point < qmin) { + nudged_zero_point = qmin; + } else if (initial_zero_point > qmax) { + nudged_zero_point = qmax; + } else { + nudged_zero_point = nearbyint(initial_zero_point); + } + + TensorQuantizationParams result; + result.scale = scale; + result.zero_point = nudged_zero_point; + return result; +} + +// This function helps to convert the Conv1D dimensions usable by the Conv2d op. +constexpr int64_t kConv1dSqueezeDim = 0; +static C10_UNUSED torch::List MakeArgForConv1d(const torch::List& arg, + int64_t base_value) { + TORCH_CHECK(!arg.empty(), "Argument must have elements."); + torch::List result({arg.get(0), base_value}); + if (arg.size() == 1) { + result[1] = arg.get(0); + } else { + result[1] = arg.get(1); + } + result[kConv1dSqueezeDim] = base_value; + return result; +} + +// The range for using FP16 quantization of weights requires that the elements +// should be in the range of [5.96e-8, 65504]. If it is out of range, then the +// number will be saturated to max or min representable values by FP16. +inline void HandleWeightsSaturation(int64_t N, float* weight) { + const float kFp16Max = RawUint16ToFp16(0x7BFF); + bool found_out_of_range = false; + for (const auto i : c10::irange(N)) { + bool saturate = CheckAndSaturate(kFp16Max, weight + i); + if (saturate) { + found_out_of_range = true; + } + } + if (found_out_of_range) { + TORCH_WARN("FOUND weight out of range "); + } +} + +// Util function for quantizing bias. 
+inline at::Tensor QuantizeBias( + bool is_per_channel, + const at::Tensor& bias, + const at::Tensor& weight_contig, + double input_scale) { + at::Tensor qbias; + if (is_per_channel) { + auto bias_quant_scales = + weight_contig.q_per_channel_scales() * input_scale; + auto bias_zp = at::zeros(bias_quant_scales.sizes(), c10::kInt); + qbias = at::native::quantize_per_channel( + bias, bias_quant_scales, bias_zp, 0, c10::kQInt32); + } else { + qbias = at::native::quantize_per_tensor( + bias, weight_contig.q_scale() * input_scale, 0, c10::kQInt32); + } + return qbias; +} + +} // namespace quant_utils diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h new file mode 100644 index 0000000000000000000000000000000000000000..9257f57b65dcd9355f0c6a7c35cecb1805e13baa --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/QuantizedOps.h @@ -0,0 +1,258 @@ +#pragma once +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +using qrelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qrelu_leaky_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Scalar& /*negval_*/); +using qgelu_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, GeluType /* approximate */); +using qsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/, double output_scale, int64_t output_zero_point); +using qhardsigmoid_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qclamp_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& min, + const Scalar& max, + at::Tensor& /*qy*/); +using qclamp_minmax_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& /*min or max*/, + at::Tensor& /*qy*/); +using qthreshold_fn = void (*)( + const at::Tensor& /*qx*/, + const Scalar& threshold, + const Scalar& value, + at::Tensor& 
/*qy*/); +using qtanh_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qelu_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*alpha*/, + const Scalar& /*scale*/, + const Scalar& /*input_scale*/, + at::Tensor& /*qy*/); +using qbinary_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Tensor& /*other*/); +using qadd_scalar_fn = + void (*)(Tensor& /*out*/, const Tensor& /*self*/, const Scalar& other /*other*/); +using qhardswish_fn = void (*)(const at::Tensor& /*qx*/, at::Tensor& /*qy*/); +using qdropout_fn = void(*)( + const at::Tensor& /*qx*/, + const Scalar& /*p*/, + bool training /*training*/, + at::Tensor& /*qy*/); +using qmaxpool_2d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iH, + int64_t iW, // input sizes + int64_t oH, + int64_t oW, // output sizes + int64_t kH, + int64_t kW, // kernel size + int64_t sH, + int64_t sW, // strides + int64_t pH, + int64_t pW, // padding + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qmaxpool_3d_fn = void (*)( + const Tensor& qx, + int64_t iC, // input/output channels + int64_t iT, + int64_t iH, + int64_t iW, // input sizes + int64_t oT, + int64_t oH, + int64_t oW, // output sizes + int64_t kT, + int64_t kH, + int64_t kW, // kernel size + int64_t sT, + int64_t sH, + int64_t sW, // strides + int64_t pT, + int64_t pH, + int64_t pW, // padding + int64_t dT, + int64_t dH, + int64_t dW, // dilation + Tensor& qy); +using qadaptive_avg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeH, + int64_t isizeW, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t istrideH, + int64_t istrideW); +using qadaptive_avg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t sizeB, + int64_t sizeC, + int64_t isizeD, + int64_t isizeH, + int64_t isizeW, + int64_t osizeD, + int64_t osizeH, + int64_t osizeW, + int64_t istrideB, + int64_t istrideC, + int64_t 
istrideD, + int64_t istrideH, + int64_t istrideW); +using qavg_pool2d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t outputWidth, + int64_t outputHeight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool count_include_pad, + std::optional divisor_override); + +using qavg_pool3d_fn = void (*)( + const Tensor& qx, + Tensor& qy, + int64_t nBatch, + int64_t nInputPlane, + int64_t inputWidth, + int64_t inputHeight, + int64_t inputDepth, + int64_t outputWidth, + int64_t outputHeight, + int64_t outputDepth, + int kW, + int kH, + int kD, + int dW, + int dH, + int dD, + int padW, + int padH, + int padD, + bool count_include_pad, + std::optional divisor_override); + +using qupsample_bilinear2d_fn = void (*)( + Tensor& output, + const Tensor& input, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + int64_t nbatch, + int64_t channels, + bool align_corners, + std::optional scales_h, + std::optional scales_w); + +using qcat_nhwc_fn = Tensor (*)( + const MaterializedITensorListRef& qxs, + int64_t dim, + double scale, + int64_t zero_point); +using qtopk_fn = void(*)(Tensor&, Tensor&, const Tensor&, int64_t, int64_t, bool, bool); + +using qbatch_norm_fn = void(*)(int64_t, int64_t, int64_t, int64_t, int64_t, const Tensor&, const Tensor&, const Tensor&, Tensor&); + +using qnormalize_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qmean_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* opt_dim */, + bool /* keepdim */, + std::optional /* opt_dtype */, + Tensor& /* Y */); + +using qstd_inner_dim_fn = void (*)( + const Tensor& /* X */, + OptionalIntArrayRef /* dim */, + const 
std::optional& /* correction */, + bool /* keepdim */, + Tensor& /* Y */); + +using qnormalize_nhwc_fn = void (*)( + const Tensor& /* X */, + const Tensor& /* gamma */, + const Tensor& /* beta */, + bool /* affine_per_channel */, + int /* num_channels */, + int /* num_groups */, + int64_t /* M */, + int64_t /* N */, + double /* eps */, + Tensor* /* Y */); + +using qprelu_fn = void (*)(Tensor& /*out*/, const Tensor& /*qx*/, + const Tensor& /*qw*/); + +DECLARE_DISPATCH(qadaptive_avg_pool2d_fn, qadaptive_avg_pool2d_nhwc_stub); +DECLARE_DISPATCH(qadaptive_avg_pool3d_fn, qadaptive_avg_pool3d_ndhwc_stub); +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_relu_stub); +DECLARE_DISPATCH(qadd_scalar_fn, qadd_scalar_stub); +DECLARE_DISPATCH(qavg_pool2d_fn, qavg_pool2d_nhwc_stub); +DECLARE_DISPATCH(qavg_pool3d_fn, qavg_pool3d_nhwc_stub); +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_relu_stub); +DECLARE_DISPATCH(qbatch_norm_fn, qbatch_norm_stub); +DECLARE_DISPATCH(qbinary_fn, qadd_relu_stub); +DECLARE_DISPATCH(qbinary_fn, qadd_stub); +DECLARE_DISPATCH(qbinary_fn, qmul_relu_stub); +DECLARE_DISPATCH(qbinary_fn, qmul_stub); +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_nhwc_stub); +DECLARE_DISPATCH(qcat_nhwc_fn, qcat_relu_nhwc_stub); +DECLARE_DISPATCH(qclamp_fn, qclamp_stub); +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_min_stub); +DECLARE_DISPATCH(qclamp_minmax_fn, qclamp_max_stub); +DECLARE_DISPATCH(qelu_fn, qelu_stub); +DECLARE_DISPATCH(qhardsigmoid_fn, qhardsigmoid_stub); +DECLARE_DISPATCH(qhardswish_fn, qhardswish_stub); +DECLARE_DISPATCH(qdropout_fn, qdropout_stub); +DECLARE_DISPATCH(qmaxpool_2d_fn, qmaxpool_2d_nhwc_stub); +DECLARE_DISPATCH(qmaxpool_3d_fn, qmaxpool_3d_nthwc_stub); +DECLARE_DISPATCH(qnormalize_fn, quantized_normalize_stub); +DECLARE_DISPATCH(qnormalize_nhwc_fn, quantized_groupnorm_nhwc_stub); +DECLARE_DISPATCH(qrelu_fn, qrelu_stub); +DECLARE_DISPATCH(qrelu_leaky_fn, qrelu_leaky_stub); +DECLARE_DISPATCH(qgelu_fn, qgelu_stub); +DECLARE_DISPATCH(qsigmoid_fn, qsigmoid_stub); 
+DECLARE_DISPATCH(qtanh_fn, qtanh_stub); +DECLARE_DISPATCH(qthreshold_fn, qthreshold_stub); +DECLARE_DISPATCH(qtopk_fn, qtopk_stub); +DECLARE_DISPATCH(qupsample_bilinear2d_fn, qupsample_bilinear2d_nhwc_stub); +DECLARE_DISPATCH(qmean_inner_dim_fn, qmean_inner_dim_stub); +DECLARE_DISPATCH(qstd_inner_dim_fn, qstd_inner_dim_stub); +DECLARE_DISPATCH(qprelu_fn, qprelu_stub); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..72abe1ad817f484e0d269b31cf78b98bf0694e5a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/RuyUtils.h @@ -0,0 +1,21 @@ +#pragma once + +#ifdef USE_RUY_QMATMUL + +#include + +namespace at { +namespace native { +namespace ruy_utils { + +ruy::Context* get_ruy_context(); + +void quantize_multiplier(double scale, + int* multiplier_fixedpoint, + int* multiplier_exponent); + +} // namespace ruy_utils +} // namespace native +} // namespace + +#endif // USE_RUY_QMATMUL diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..ff334d4c8d48ceeb4fa83fdbcd2e678a3e2d887d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/XnnpackUtils.h @@ -0,0 +1,335 @@ +#pragma once + +#ifdef USE_XNNPACK +#include + +#include +#include + +using xnnpack_operator = at::native::xnnpack::Operator; + +namespace at { +namespace native { +namespace xnnp_utils { + +/* + * Return shape in the same order as the memory format + * e.g. 
channels_last will return NHWC instead of NCHW + */ +std::vector get_mem_format_aware_shape(const at::Tensor& in); + +/* + * Input is always int8_t, output can be [int8_t, uint8_t]. + * input + offset = output + * int8_t + 128 = uint8_t + * int8_t + 0 = int8_t + */ +template +void q8_copy_int8_weight_and_add_offset(const at::Tensor& in, at::Tensor& out); + +template +Tensor convert_conv_weights_to_channel_last_tensor( + const at::Tensor& src, + int groups, + bool transpose); + +/* + * Series of create wrapper functions to call xnn_create_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_convolution2d_nhwc( + uint32_t pad_top, + uint32_t pad_right, + uint32_t pad_bottom, + uint32_t pad_left, + uint32_t kernel_h, + uint32_t kernel_w, + uint32_t stride_h, + uint32_t stride_w, + uint32_t dilation_h, + uint32_t dilation_w, + uint32_t groups, + size_t group_input_channels, + size_t group_output_channels, + size_t ip_chan_stride, + size_t op_chan_stride, + int8_t izp, + float ip_scale, + int8_t kzp, + const float* k_scales, + const int8_t* kernel, + const int32_t* bias, + int8_t ozp, + float op_scale, + int8_t op_min, + int8_t op_max, + uint32_t flags, + xnn_operator_t* op, + bool per_channel, + bool transpose) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kzp, "XNNPACK Q[SC]8 conv kernels expects kernel zero point to be zero." 
+ "But got: ", kzp); + + if (transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_create_deconvolution2d_nhwc_qs8( + pad_top, /* uint32_t output_padding_top */ + pad_right, /* uint32_t output_padding_right */ + pad_bottom, /* uint32_t output_padding_bottom */ + pad_left, /* uint32_t output_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t stride_height */ + stride_w, /* uint32_t stride_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels */ + ip_chan_stride, /* size_t input_pixel_stride */ + op_chan_stride, /* size_t output_pixel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* deconvolution_op_out */ + + } + + if (!per_channel) { + return xnn_create_convolution2d_nhwc_qs8( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + 
group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales[0], /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* convolution_op_out */ + } else { /* per_channel */ + return xnn_create_convolution2d_nhwc_qs8_qc8w( + pad_top, /* uint32_t input_padding_top */ + pad_right, /* uint32_t input_padding_right */ + pad_bottom, /* uint32_t input_padding_bottom */ + pad_left, /* uint32_t input_padding_left */ + kernel_h, /* uint32_t kernel_height */ + kernel_w, /* uint32_t kernel_width */ + stride_h, /* uint32_t subsampling_height */ + stride_w, /* uint32_t subsampling_width */ + dilation_h, /* uint32_t dilation_height */ + dilation_w, /* uint32_t dilation_width */ + groups, /* uint32_t groups */ + group_input_channels, /* size_t group_input_channels */ + group_output_channels, /* size_t group_output_channels*/ + ip_chan_stride, /* size_t input_channel_stride */ + op_chan_stride, /* size_t output_channel_stride */ + izp, /* int8_t input_zero_point */ + ip_scale, /* float input_scale */ + k_scales, /* const float* kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + ozp, /* int8_t output_zero_point */ + op_scale, /* float output_scale */ + op_min, /* int8_t output_min */ + op_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t weights_cache */ + op); /* xnn_operator_t* convolution_op_out */ + } +} + +/* + * Series of reshape wrapper functions to call 
xnn_reshape_[de]conv* functions. + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_reshape_convolution2d_nhwc( + xnn_operator_t op, + size_t batch, + size_t in_h, + size_t in_w, + pthreadpool_t pt_pool, + bool per_channel = false, + bool transpose = false, + uint32_t adj_h = 0, + uint32_t adj_w = 0) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + return xnn_reshape_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + adj_h, /* uint32_t adjustment_height */ + adj_w, /* uint32_t adjustment_width */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } + + size_t workspace_size = SIZE_MAX; + size_t workspace_alignment = SIZE_MAX; + + if (!per_channel) { + return xnn_reshape_convolution2d_nhwc_qs8( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + &workspace_size, /* size_t* workspace_size */ + &workspace_alignment, /* size_t* workspace_alignment */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } else { /* per_channel */ + return xnn_reshape_convolution2d_nhwc_qs8_qc8w( + op, /* xnn_operator_t convolution_op */ + batch, /* size_t batch_size */ + in_h, /* size_t input_height */ + in_w, /* size_t input_width */ + &workspace_size, /* size_t* workspace_size */ + &workspace_alignment, /* size_t* workspace_alignment */ + nullptr, /* size_t* output_height_out */ + nullptr, /* size_t* output_width_out */ + pt_pool); /* pthreadpool_t threadpool */ + } +} + + +/* + * Series of setup wrapper functions to call xnn_setup_[de]conv* functions. 
+ */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_convolution2d_nhwc( + xnn_operator_t op, + const int8_t* inp, + int8_t* outp, + bool per_channel = false, + bool transpose = false) { + if(transpose) { + TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!"); + + return xnn_setup_deconvolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } + + if (!per_channel) { + return xnn_setup_convolution2d_nhwc_qs8( + op, /* xnn_operator_t deconvolution_op */ + nullptr, /* void workspace */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } else { /* per_channel */ + return xnn_setup_convolution2d_nhwc_qs8_qc8w( + op, /* xnn_operator_t deconvolution_op */ + nullptr, /* void workspace */ + inp, /* const int8_t* input */ + outp); /* int8_t* output */ + } +} + + +/* + * Series of wrapper functions to call xnn_create* and xnn_setup* + * functions for linear + */ +C10_ALWAYS_INLINE +enum xnn_status xnnp_create_fully_connected_nc( + size_t input_channels, + size_t output_channels, + size_t input_stride, + size_t output_stride, + int8_t input_zero_point, + float input_scale, + int8_t kernel_zero_point, + float kernel_scale, + const int8_t* kernel, + const int32_t* bias, + int8_t output_zero_point, + float output_scale, + int8_t output_min, + int8_t output_max, + uint32_t flags, + xnn_operator_t* fully_connected_op_out) { + /* Symmetric quantization forces kzp = 0 */ + TORCH_CHECK(!kernel_zero_point, "XNNPACK QS8 linear kernel expects kernel zero point to be zero." 
+ "But got: ", kernel_zero_point); + return xnn_create_fully_connected_nc_qs8( + input_channels, /* size_t input_channels */ + output_channels, /* size_t output_channels */ + input_stride, /* size_t input_stride */ + output_stride, /* size_t output_stride */ + input_zero_point, /* int8_t input_zero_point */ + input_scale, /* float input_scale */ + kernel_scale, /* float kernel_scale */ + kernel, /* const int8_t* kernel */ + bias, /* const int32_t* bias */ + output_zero_point, /* int8_t output_zero_point */ + output_scale, /* float output_scale */ + output_min, /* int8_t output_min */ + output_max, /* int8_t output_max */ + flags, /* uint32_t flags */ + nullptr, /* xnn_caches_t caches */ + nullptr, /* xnn_weights_cache_t */ + fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_reshape_fully_connected_nc( + xnn_operator_t fully_connected_op, + size_t batch_size, + pthreadpool_t threadpool) { + return xnn_reshape_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + batch_size, /* size_t batch_size */ + threadpool); /* pthreadpool_t threadpool */ +} + +C10_ALWAYS_INLINE +enum xnn_status xnnp_setup_fully_connected_nc( + xnn_operator_t fully_connected_op, + const int8_t* input, + int8_t* output) { + return xnn_setup_fully_connected_nc_qs8( + fully_connected_op, /* xnn_operator_t fully_connected_op */ + input, /* const int8_t* input */ + output /* int8_t* output */ + ); +} + +} // namespace xnnp_utils +} // namespace native +} // namespace at + +#endif // USE_XNNPACK diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h new file mode 100644 index 0000000000000000000000000000000000000000..85451fb57482a31c2165366505993c0ef67aa920 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/conv_serialization.h @@ -0,0 +1,414 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#if !defined(__s390x__) && !defined(__powerpc__) +#include +#endif + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#else +#include +#endif + + +#include + +/* Convolution prepacked parameters serialization. + * + * Version 1 + * + * - Fields: + * 1. weight + * 2. bias + * 3. stride x kSpatialDim + * 4. padding x kSpatialDim + * 5. dilation x kSpatialDim + * 6. groups + * + * Version 2 + * + * - Fields: + * 0. version (string) + * 1. list of non-optional tensors + * 0: packed parameters (int16_t) + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - transpose (0 or 1) + * 1: weight + * 2. list of optional tensors + * 0: bias + * + * Version 3 + * + * - Fields: + * 0. version (int64_t) + * 1. list of int64_t configuration values + * - kSpatialDim + * - stride x kSpatialDim + * - padding x kSpatialDim + * - dilation x kSpatialDim + * - output_padding x kSpatialDim + * - groups + * - flags (bitmask) + * - (1 << 0) transpose (1 = yes) + * 2. list of optional tensors + * 0: None (helps with type inference) + * 1: weight (this must be present) + * 2: bias + */ + +using ConvParamsSerializationTypeV2 = std::tuple< + // version, for versions 2 and up + std::string, + // non-optional tensors + std::vector, + // optional tensors + std::vector>>; + +using ConvParamsSerializationTypeV3 = std::tuple< + // version, int for versions 3 and up + int64_t, + // configuration values + std::vector, + // optional tensors + std::vector>>; + +// Parses any historical conv packed params format into +// the current format. 
+template +ConvParamsSerializationTypeV3 parse_conv_serialized_state(c10::IValue v) { + + // determine the version based on IValue contents + int version = -1; + if (v.isTuple()) { + const auto& elements = v.toTupleRef().elements(); + if (!elements.empty()) { + auto firstElement = elements[0]; + if (firstElement.isTensor()) { + version = 1; + } else if (firstElement.isString()) { + const std::string& version_str = firstElement.toStringRef(); + // note: not parsing the string to automatically handle bad + // inputs + if (version_str == "2") { + version = 2; + } + } else if (firstElement.isInt()) { + auto raw_version = firstElement.toInt(); + if (raw_version == 3) { + version = 3; + } + } + } + } + TORCH_INTERNAL_ASSERT(version != -1, "Unable to parse serialization version"); + + if (version == 1) { + // version 1 - convert to version 3 manually + + const auto& elements = v.toTupleRef().elements(); + + at::Tensor weight = elements[0].toTensor(); + std::optional bias = elements[1].toOptional(); + torch::List stride_x_kSpatialDim = elements[2].toTensorList(); + torch::List padding_x_kSpatialDim = elements[3].toTensorList(); + torch::List dilation_x_kSpatialDim = elements[4].toTensorList(); + at::Tensor groups = elements[5].toTensor(); + + std::vector config_vals; + config_vals.reserve( + stride_x_kSpatialDim.size() + padding_x_kSpatialDim.size() + + dilation_x_kSpatialDim.size() + kSpatialDim + 3); + config_vals.push_back(kSpatialDim); + for (const auto i : c10::irange(stride_x_kSpatialDim.size())) { + auto stride = stride_x_kSpatialDim.get(i); + config_vals.push_back(stride[0].item()); + } + for (const auto i : c10::irange(padding_x_kSpatialDim.size())) { + auto padding = padding_x_kSpatialDim.get(i); + config_vals.push_back(padding[0].item()); + } + for (const auto i : c10::irange(dilation_x_kSpatialDim.size())) { + auto dilation = dilation_x_kSpatialDim.get(i); + config_vals.push_back(dilation[0].item()); + } + // output_padding does not exist in v1, so we fill in a 
default value + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + config_vals.push_back(0); + } + config_vals.push_back(groups[0].item()); + // transpose does not exist in v1, so we fill in a default value + config_vals.push_back(0); + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 2) { + // version 2 + const auto& elements = v.toTupleRef().elements(); + std::vector non_optional = elements[1].toTensorList().vec(); + std::vector> optional; + + if (elements[2].isTensorList()) { + for (const auto& elem : elements[2].toTensorList()) { + optional.emplace_back(static_cast(elem)); + } + } else { + for (const auto& elem : elements[2].toList()) { + optional.emplace_back(static_cast(elem).toOptional()); + } + } + // create default optional value for bias + if (optional.empty()) { + optional.emplace_back(); + } + + auto config_a = non_optional[0].accessor(); + std::vector config_vals; + config_vals.reserve(config_a.size(0)); + for (const auto i : c10::irange(config_a.size(0))) { + config_vals.emplace_back(config_a[i]); + } + + auto weight = non_optional[1]; + auto bias = optional[0]; + + std::vector> tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); + } else if (version == 3) { + return v.to(); + } else { + TORCH_INTERNAL_ASSERT(false, "Unexpected serialized qconv version: ", + version); + } +} + +#define QCONV_SERIALIZATION_VERSION 2 + +#if QCONV_SERIALIZATION_VERSION == 2 +using ConvParamsSerializationType = ConvParamsSerializationTypeV2; + +template +ConvParamsSerializationTypeV2 serialize_conv( + const c10::intrusive_ptr>& params) { + + std::string version = "2"; + std::vector non_optional; + std::vector> optional; + + // create a packed int8_t tensor for conv params + std::vector 
params_vec; + params_vec.push_back(kSpatialDim); + auto stride = params->stride().vec(); + params_vec.insert(params_vec.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + params_vec.insert(params_vec.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + params_vec.insert(params_vec.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + params_vec.insert(params_vec.end(), output_padding.begin(), + output_padding.end()); + params_vec.push_back(params->groups()); + params_vec.push_back(params->transpose()); + int64_t vec_size = params_vec.size(); + at::Tensor params_tensor = at::from_blob( + params_vec.data(), {vec_size}, + at::TensorOptions().dtype(at::kShort)) + // clone to retain ownership of the data + .clone(); + + auto [weight, bias] = params->unpack(); + + non_optional.emplace_back(std::move(params_tensor)); + non_optional.emplace_back(std::move(weight)); + optional.emplace_back(std::move(bias)); + + return std::tie(version, non_optional, optional); +} + +#elif QCONV_SERIALIZATION_VERSION == 3 +using ConvParamsSerializationType = ConvParamsSerializationTypeV3; + +template +ConvParamsSerializationTypeV3 serialize_conv( + const c10::intrusive_ptr>& params) { + std::vector config_vals; + config_vals.push_back(kSpatialDim); + auto stride = params->stride().vec(); + config_vals.insert(config_vals.end(), stride.begin(), stride.end()); + auto padding = params->padding().vec(); + config_vals.insert(config_vals.end(), padding.begin(), padding.end()); + auto dilation = params->dilation().vec(); + config_vals.insert(config_vals.end(), dilation.begin(), dilation.end()); + auto output_padding = params->output_padding().vec(); + config_vals.insert(config_vals.end(), output_padding.begin(), + output_padding.end()); + config_vals.push_back(params->groups()); + config_vals.push_back(params->transpose()); + + auto [weight, bias] = params->unpack(); + + std::vector> 
tensors; + tensors.emplace_back(); + tensors.emplace_back(weight); + tensors.emplace_back(bias); + + int64_t version = 3; + return std::tie(version, config_vals, tensors); +} + +#else +#error "Invalid qconv serialization version." +#endif + +template +c10::intrusive_ptr> deserialize_conv( + ConvParamsSerializationTypeV3 state) { + auto [version, config_vals, tensors] = state; + TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); + + TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); + std::optional weight = tensors[1]; + std::optional bias = tensors[2]; + TORCH_INTERNAL_ASSERT(weight, "Weight should always be present in serialized qconv."); + + torch::List stride, padding, output_padding, dilation; + // skip kSpatialDim + int idx = 1; + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + stride.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + padding.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + dilation.emplace_back(config_vals.at(idx)); + idx++; + } + for (C10_UNUSED const auto i : c10::irange(kSpatialDim)) { + TORCH_INTERNAL_ASSERT(idx < static_cast(config_vals.size()), + "Unexpected index = ", idx, " for config_vals of size ", + config_vals.size()); + output_padding.emplace_back(config_vals.at(idx)); + idx++; + } + int64_t groups = config_vals.at(idx); + idx++; + int64_t flags = config_vals.at(idx); + idx++; + TORCH_INTERNAL_ASSERT(idx == static_cast(config_vals.size()), + "Unexpected length of config_vals, expected ", + idx, + " got ", + config_vals.size()); + + bool transpose = flags & (1 << 0); + + int64_t other_flags = flags & ~(1 << 0); + TORCH_INTERNAL_ASSERT(other_flags == 0, "Unexpected flags set in ", flags, "."); + + auto& ctx = at::globalContext(); + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::X86) { +#if AT_MKLDNN_ENABLED() + bool use_onednn = 
onednn_utils::should_use_onednn_quant( + weight.value(), transpose, groups, output_padding); + if (use_onednn) { + return PackedConvWeightsOnednn::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif + return PackedConvWeight::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } // x86 +#endif + +#ifdef USE_FBGEMM + if (ctx.qEngine() == at::QEngine::FBGEMM) { + return PackedConvWeight::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_FBGEMM +#ifdef USE_PYTORCH_QNNPACK + if (ctx.qEngine() == at::QEngine::QNNPACK) { + TORCH_CHECK( + kSpatialDim == 2, + "prepack/__setstate__: QNNPACK only supports Conv2d " + "now."); + return PackedConvWeightsQnnp::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // USE_PYTORCH_QNNPACK +#if AT_MKLDNN_ENABLED() + if (ctx.qEngine() == at::QEngine::ONEDNN) { + return PackedConvWeightsOnednn::prepack( + weight.value(), + bias, + stride, + padding, + output_padding, + dilation, + groups, + transpose + ); + } +#endif // AT_MKLDNN_ENABLED() +TORCH_CHECK( + false, + "Didn't find engine for when deserializing ConvPackedParams: ", + toString(ctx.qEngine())); +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..407d6550574dc2cef42a5db7b9ffc5df92cfa2f9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/fbgemm_utils.h @@ -0,0 +1,413 @@ +#pragma once + +#include +#include +#include +#include +#include + +#ifdef USE_FBGEMM +#include 
+C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Winconsistent-missing-destructor-override") +#include +C10_DIAGNOSTIC_POP() +#include + +// The struct for the packed weight matrix (PackBMatrix) and the corresponding +// column offsets used for the fully connect layer, which are both prepared in +// the prepacking step to save the computations in the inference. Note the +// column offsets include the sum of the B columns as well as the scalar term +// B_zero_point * K, whereas the row offsets created by +// PackAWithQuantRowOffset/PackAWithIm2Col/PackAWithRowOffset are only the sum +// of the A rows. The column offsets are needed for the asymmetric quantization +// (affine quantization) of input matrix. +// Note that in JIT mode we can think of a way to fuse col_offsets with bias. +struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase { + PackedLinearWeight( + std::unique_ptr> w, + std::optional bias, + std::vector col_offsets, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme) + : w(std::move(w)), + bias_(std::move(bias)), + col_offsets(std::move(col_offsets)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(std::move(q_scheme)) {} + std::unique_ptr> w; + std::optional bias_; + std::vector col_offsets; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + at::Tensor input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor& apply_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor& apply_relu_out( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output) override; + + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor 
apply_with_input_q_dq_qweight_dq_relu_output_fp32( + at::Tensor input, + double input_scale, + int64_t input_zero_point) override; + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor weight, + std::optional bias); + + private: + template + at::Tensor& apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point, + at::Tensor& output); + + template + at::Tensor apply_with_input_q_dq_qweight_dq_output_fp32_impl( + const at::Tensor& input, + double input_scale, + int64_t input_zero_point); + + template + at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false); +}; + +struct TORCH_API PackedLinearWeightFp16 : public LinearPackedParamsBase { + PackedLinearWeightFp16( + std::unique_ptr w, + std::optional bias) + : w(std::move(w)), bias_(std::move(bias)) {} + + std::unique_ptr w; + std::optional bias_; + + at::Tensor apply( + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { + TORCH_INTERNAL_ASSERT(false); + } + at::Tensor apply_relu( + at::Tensor /*input*/, + double /*output_scale*/, + int64_t /*output_zero_point*/) override { + TORCH_INTERNAL_ASSERT(false); + } + + at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false) + override; + at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false) + override; + + at::Tensor& apply_dynamic_out( + const at::Tensor& input, + at::Tensor& output, + bool reduce_range = false) override; + at::Tensor& apply_dynamic_relu_out( + const at::Tensor& input, + at::Tensor& output, + bool reduce_range = false) override; + + std::tuple> unpack() override; + + std::optional bias() override { + return bias_; + } + + static c10::intrusive_ptr prepack( + at::Tensor 
weight, + std::optional bias); + + void set_bias(std::optional bias) override; + + private: + template + at::Tensor& apply_dynamic_impl(const at::Tensor& input, at::Tensor& output); +}; + +template +struct TORCH_API PackedConvWeight : public ConvPackedParamsBase { + PackedConvWeight( + std::unique_ptr> w, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + uint8_t transpose, + std::vector col_offsets, + std::vector kernel, + std::vector w_scale, + std::vector w_zp, + c10::QScheme q_scheme) + : w(std::move(w)), + bias(std::move(bias)), + stride_(std::move(stride)), + padding_(std::move(padding)), + output_padding_(std::move(output_padding)), + dilation_(std::move(dilation)), + groups_(groups), + transpose_(transpose), + col_offsets(std::move(col_offsets)), + kernel(std::move(kernel)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + q_scheme(q_scheme) {} + + std::unique_ptr> w; + std::optional bias; + torch::List stride_; + torch::List padding_; + torch::List output_padding_; + torch::List dilation_; + int64_t groups_; + uint8_t transpose_; + std::vector col_offsets; + std::vector kernel; + std::vector w_scale; + std::vector w_zp; + c10::QScheme q_scheme; + + at::Tensor apply( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_relu( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point) override; + + at::Tensor apply_dynamic( + const at::Tensor& input, + bool reduce_range) override; + + std::tuple> unpack() override; + + static c10::intrusive_ptr> prepack( + at::Tensor weight, + std::optional bias, + torch::List stride, + torch::List padding, + torch::List output_padding, + torch::List dilation, + int64_t groups, + bool transpose); + + const float* GetBiasData(at::Tensor* bias); + + void GetQuantizationParams( + float act_scale, + float out_scale, + std::vector* output_multiplier_float, + 
std::vector* act_times_w_scale); + + torch::List stride() const override { + return stride_; + } + + torch::List padding() const override { + return padding_; + } + + torch::List output_padding() const override { + return output_padding_; + } + + torch::List dilation() const override { + return dilation_; + } + + int64_t groups() const override { + return groups_; + } + + bool transpose() const override { + return (bool)transpose_; + } + + private: + template + at::Tensor apply_impl( + const at::Tensor& input, + double output_scale, + int64_t output_zero_point); +}; + +// PackWeight: Convert the weight from uint8 to int8. +inline void convert_uint8_int8( + int len, + const uint8_t* src_uint8, + int8_t* dst_int8) { + for (const auto i : c10::irange(len)) { + dst_int8[i] = static_cast(static_cast(src_uint8[i]) - 128); + } +} + +// UnpackWeight: Convert the weight from int8 to uint8. +inline void convert_int8_uint8( + int len, + const int8_t* src_int8, + uint8_t* dst_uint8) { + for (const auto i : c10::irange(len)) { + dst_uint8[i] = + static_cast(static_cast(src_int8[i]) + 128); + } +} + +namespace at { +namespace native { +namespace fbgemm_utils { + +template +fbgemm::conv_param_t MakeFbgemmConvParam( + int N, + int C, + int M, + const std::vector& image_shape, + int groups, + const std::vector& kernels, + const std::vector& strides, + const std::vector& pads, + const std::vector& dilations, + const std::vector& output_padding = std::vector(kSpatialDim, 0), + bool transposed = false); + +// TODO: Remove functions below when ChannelsLast3d is ready. 
+Tensor MakeStridedQTensorCPU( + const IntArrayRef& sizes, + const IntArrayRef& strides, + const TensorOptions& options, + QuantizerPtr quantizer); + +Tensor MakeEmptyAffineQuantizedChannelsLast3dTensor( + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + const TensorOptions& options, + double scale, + int64_t zero_point); + +Tensor MakeEmptyPerChannelAffineQuantizedChannelsLast3dTensor( + int64_t N, + int64_t C, + int64_t D, + int64_t H, + int64_t W, + const TensorOptions& options, + const Tensor& scales, + const Tensor& zero_points); + +Tensor ConvertToChannelsLast3dTensor(const Tensor& src); + +template +Tensor TransposeConvTensorUnpackConversion(const Tensor& src, int groups); + +template +Tensor ConvertConvWeightsToChannelLastTensor( + const at::Tensor& src, + int groups, + bool transpose); +} // namespace fbgemm_utils +} // namespace native +} // namespace at + +#endif // USE_FBGEMM + +struct TORCH_API PackedEmbeddingBagWeight : public EmbeddingPackedParamsBase { + PackedEmbeddingBagWeight( + at::Tensor packed_w, + std::vector w_scale, + std::vector w_zp, + int64_t bit_rate, + c10::QScheme q_scheme, + int64_t version) + : packed_w(std::move(packed_w)), + w_scale(std::move(w_scale)), + w_zp(std::move(w_zp)), + bit_rate_(bit_rate), + q_scheme(q_scheme), + version_(version) { + // NOLINTNEXTLINE(clang-analyzer-cplusplus.Move) + if (!packed_w.is_contiguous()) { + packed_w = packed_w.contiguous(); + } + } + + at::Tensor packed_w; + std::vector w_scale; + std::vector w_zp; + int64_t bit_rate_; + c10::QScheme q_scheme; + int64_t version_; + + at::Tensor unpack() override; + static c10::intrusive_ptr prepack( + at::Tensor weight); + + int64_t bit_rate() const override { + return bit_rate_; + } + + int64_t version() const override { + return version_; + } + + at::Tensor embeddingbag_byte( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& 
compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) override; + + at::Tensor embeddingbag_4bit( + const at::Tensor& indices, + const std::optional& offsets, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset, + bool is_embedding_op) override; +}; diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h new file mode 100644 index 0000000000000000000000000000000000000000..dbfb406ea55dbb50f97b1e86efb52c337af04847 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/init_qnnpack.h @@ -0,0 +1,13 @@ +#pragma once + +#ifdef USE_PYTORCH_QNNPACK + +namespace at { +namespace native { + +void initQNNPACK(); + +} // namespace native +} // namespace at + +#endif diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h new file mode 100644 index 0000000000000000000000000000000000000000..644d85fa357ee21140986a2a225dfa937a68437e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag.h @@ -0,0 +1,34 @@ +#pragma once +#include +#include + +namespace at { +namespace native { +Tensor& embedding_bag_byte_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const std::optional& offsets_in, + const bool /* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& embedding_bag_4bit_rowwise_offsets_out( + Tensor& output, + const Tensor& weight, + const Tensor& indices, + const std::optional& offsets_in, + const bool 
/* scale_grad_by_freq */, + const int64_t /* mode */, + bool pruned_weights, + const std::optional& per_sample_weights_, + const std::optional& compressed_indices_mapping, + bool include_last_offset); + +Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weight); + +} // native +} // at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h new file mode 100644 index 0000000000000000000000000000000000000000..0a65f3f07f397b931c1a4b6bd781e6308643117f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/quantized/cpu/qembeddingbag_prepack.h @@ -0,0 +1,13 @@ +#pragma once +#include + +namespace at { namespace native { + +Tensor& qembeddingbag_byte_prepack_out(Tensor& output, const Tensor& weight); + +Tensor qembeddingbag_byte_prepack(const Tensor& weight); + +Tensor qembeddingbag_byte_prepack_meta(const Tensor& weight); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/attention.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/attention.h new file mode 100644 index 0000000000000000000000000000000000000000..49fbdc46ee2a687ad60b3233ed0be99737b51cf5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/attention.h @@ -0,0 +1,72 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace at { +namespace native { + +using fused_sdp_choice_fn = int64_t (*)(const Tensor& query_, const Tensor& key, const Tensor& value, + const std::optional& attn_mask_, double dropout_p, bool is_causal, std::optional scale, bool enable_gqa); + +DECLARE_DISPATCH(fused_sdp_choice_fn, _fused_sdp_choice_stub); + +TORCH_API Tensor bmm_nt(const Tensor& a, const Tensor& b); +TORCH_API Tensor masked_softmax( + Tensor& 
attn_scores, + std::optional attn_mask, + const Tensor& query, + std::optional mask_type = {}); + +using transform_bias_rescale_qkv_fn = void(*)( + at::ScalarType type, + void* _q_k_v, + const void* _qkv, + const void* _qkv_bias, + int64_t B, + int64_t T, + int64_t D, + int64_t num_head); + +DECLARE_DISPATCH(transform_bias_rescale_qkv_fn, transform_bias_rescale_qkv_stub); + +TORCH_API Tensor transform0213_gemm_nt_bias( + const Tensor& a, + const Tensor& b, + const Tensor& c, + const Tensor& query); + +TORCH_API Tensor bmm_nn(Tensor& out, const Tensor& a, const Tensor& b); + +TORCH_API void debug_assert_shape(int line, const Tensor& t, c10::IntArrayRef shape); + +TORCH_API Tensor qkv_projection( + const Tensor& query, + const Tensor& key, + const Tensor& value, + const int64_t embed_dim, + const Tensor& qkv_weight); + +using flash_attention_fn = void (*)( + const Tensor& output, const Tensor& logsumexp, + const Tensor& query, const Tensor& key, const Tensor& value, + double dropout_p, bool is_causal, + std::optional attn_mask, + std::optional scale); + +using flash_attention_backward_fn = void (*)( + const Tensor& grad_q, const Tensor& grad_k, + const Tensor& grad_v, const Tensor& grad_out, + const Tensor& query, const Tensor& key, + const Tensor& value, const Tensor& out, const Tensor& logsumexp, + double dropout_p, bool is_causal, + std::optional attn_mask, + std::optional scale); + +DECLARE_DISPATCH(flash_attention_fn, flash_attention_kernel); +DECLARE_DISPATCH(flash_attention_backward_fn, flash_attention_backward_kernel); + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h new file mode 100644 index 0000000000000000000000000000000000000000..272700392e1d71e73788dd03fc66e14153040937 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/transformers/sdp_utils_cpp.h @@ -0,0 +1,566 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace sdp { + +constexpr int32_t num_backends = 5; +enum class SDPBackend { + error = -1, + math = 0, + flash_attention = 1, + efficient_attention = 2, + cudnn_attention = 3, + overrideable = 4 +}; + +// Note that if this changed make sure to update +// the templated enum in mem_eff/kernel_forward.h and mem_eff/kernel_backward.h +enum class CustomMaskType { + NoCustomMask = 0, + CausalFromTopLeft = 1, + CausalFromBottomRight = 2, + NumCustomMaskTypes, +}; + +struct sdp_params { + at::Tensor query; + at::Tensor key; + at::Tensor value; + std::optional attn_mask; + double dropout; + bool is_causal; + bool enable_gqa; +}; + +SDPBackend select_sdp_backend_cpp(sdp_params const& kernel_params); + +inline c10::SymFloat calculate_scale( + const at::Tensor& query, + std::optional scale) { + const auto softmax_scale = scale.has_value() + ? 
scale.value() + : (c10::SymFloat(1.0) / (c10::SymFloat(query.sym_size(-1)).sqrt())); + return c10::SymFloat(softmax_scale); +} + +using c10::array_of; + +inline bool input_requires_grad(sdp_params const& params) { + const bool any_inputs_require_grad = params.query.requires_grad() || + params.key.requires_grad() || params.value.requires_grad(); + const bool gradmode_enabled = at::GradMode::is_enabled(); + return any_inputs_require_grad && gradmode_enabled; +} + +inline bool has_for_nested_inputs(sdp_params const& params) { + return + (params.query.is_nested() && params.query.layout() == c10::kStrided) || + (params.key.is_nested() && params.key.layout() == c10::kStrided) || + (params.value.is_nested() && params.value.layout() == c10::kStrided); +} + +inline bool has_for_dense_inputs(sdp_params const& params) { + return !params.query.is_nested() || !params.key.is_nested() || !params.value.is_nested(); +} + +inline bool has_only_dense_inputs(sdp_params const& params) { + return !params.query.is_nested() && !params.key.is_nested() && !params.value.is_nested(); +} + +template +inline bool check_tensor_dtype( + sdp_params const& params, + dtype_vector allowed_dtypes, + bool debug) { + auto query_dtype = params.query.dtype(); + if (!(query_dtype == params.key.dtype() && + query_dtype == params.value.dtype() && + (std::find(allowed_dtypes.begin(), allowed_dtypes.end(), query_dtype) != + allowed_dtypes.end()))) { + if (debug) { + TORCH_WARN( + "Expected query, key and value to all be of dtype: {", + c10::Join(", ", allowed_dtypes), + "}. 
Got ", + "Query dtype: ", + params.query.dtype(), + ", Key dtype: ", + params.key.dtype(), + ", and Value dtype: ", + params.value.dtype(), + " instead."); + } + return false; + } + return true; +} + + +inline bool try_broadcast_param_size( + const c10::SymInt q_size, + const c10::SymInt k_size, + const c10::SymInt v_size, + c10::string_view param_name, + bool debug) { + auto max_size = std::max({q_size, k_size, v_size}); + if ((q_size != max_size && q_size != 1) || + (k_size != max_size && k_size != 1) || + (v_size != max_size && v_size != 1)) { + if (debug) { + TORCH_WARN( + "Both fused kernels require query, key and value to have broadcastable ", + param_name, + "got Query ", + param_name, + q_size, + ", Key ", + param_name, + k_size, + ", Value ", + param_name, + v_size, + " instead."); + } + return false; + } + return true; +} + +inline bool check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + at::Tensor const& param, + c10::string_view param_name, + bool debug) { + const auto nt_tensor_impl = at::native::get_nested_tensor_impl(param); + const at::Tensor& sizes = nt_tensor_impl->get_nested_sizes(); + auto num_head_dims = nt_tensor_impl->opt_size(1); + if (!num_head_dims.has_value()) { + // num_head_dims is ragged + if (debug) { + TORCH_WARN( + "Fused kernels do not support ragged num_head_dims, ", + param_name, + "has a ragged num_heads."); + } + return false; + } + + auto* sizes_ptr = sizes.data_ptr(); + const int64_t n_tensors = param.size(0); + const int64_t size_tensor_stride = sizes.stride(0); + + // This is being called inside sdp with shape [batch, heads, {seq_len}, dim] + for (const auto i : c10::irange(n_tensors)) { + if (sizes_ptr[(i * size_tensor_stride) + 1] == 0) { + if (debug) { + TORCH_WARN( + "Fused kernels do not support seq_len == 0, ", + param_name, + "has a seq len of 0."); + } + return false; + } + } + return true; +} + +inline bool check_for_seq_len_0_nested_tensor(sdp_params const& params, bool debug) { + // When this 
function is called we are assured that the nt is dim==4 + bool q_is_safe = params.query.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.query, "query ", debug) + : true; + // short circuit if any is unsafe + if (!q_is_safe) { + return false; + } + + bool k_is_safe = params.key.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.key, "key ", debug) + : true; + if (!k_is_safe) { + return false; + } + + bool v_is_safe = params.value.is_nested() + ? check_for_seq_len_0_and_consistent_head_dim_nested_tensor_helper( + params.value, "value ", debug) + : true; + if (!v_is_safe) { + return false; + } + + // We now know none of the inputs have ragged num_heads, so we can safely + // access .size(1) + auto q_num_heads = params.query.size(1); + auto k_num_heads = params.key.size(1); + auto v_num_heads = params.value.size(1); + bool same_num_heads = + q_num_heads == k_num_heads && q_num_heads == v_num_heads; + + if (!same_num_heads) { + if (input_requires_grad(params)){ + if (debug) { + TORCH_WARN( + "Both fused kernels do not support training with broadcasted NT inputs."); + } + return false; + } + return try_broadcast_param_size( + q_num_heads, k_num_heads, v_num_heads, "num heads ", debug); + } + + return true; +} + +inline bool check_nested_tensor(sdp_params const& params, bool debug) { + // Return false if have nested tensor + if (!has_only_dense_inputs(params)) { + if (debug) { + TORCH_WARN( + "Both fused kernels of cpp version currently do not support Nested Tensor inputs."); + } + return false; + } + return true; +} + +inline bool check_for_dropout(sdp_params const& params, bool debug) { + if (params.dropout > 0.0) { + if (debug) { + TORCH_WARN("Both fused kernels do not support non-zero dropout."); + } + return false; + } + return true; +} + +inline bool check_requires_grad_and_nested(sdp_params const& params, bool debug) { + if (input_requires_grad(params)) { + if (debug) { + TORCH_WARN( 
+ "Memory efficient attention currently doesn't support training with NT inputs."); + } + return false; + } + return true; +} + +inline bool check_for_attn_mask(sdp_params const& params, bool debug) { + if (params.attn_mask.has_value()) { + if (debug) { + TORCH_WARN("Flash Attention does not support non-null attn_mask."); + } + return false; + } + return true; +} + +inline bool check_attn_mask_shape(sdp_params const& params, bool debug) { + auto attn_mask = params.attn_mask; + if (!attn_mask.has_value()) { + return true; + } + if (attn_mask.value().requires_grad()) { + return false; + } + auto batchSize = params.query.sym_size(0); + auto qSize = params.query.sym_size(2); + auto kvSize = params.key.sym_size(2); + auto num_head = params.query.sym_size(1); + if (attn_mask.value().sym_size(-2) != qSize && attn_mask.value().sym_size(-2) != 1) { + return false; + } + if (attn_mask.value().sym_size(-1) != kvSize && attn_mask.value().sym_size(-1) != 1) { + return false; + } + if (attn_mask.value().dim() == 2) { + return true; + } else if (attn_mask.value().dim() == 4) { + if ((attn_mask.value().sym_size(0) == 1 || attn_mask.value().sym_size(0) == batchSize) + && (attn_mask.value().sym_size(1) == 1 || attn_mask.value().sym_size(1) == num_head)) { + return true; + } + } + if (debug) { + TORCH_WARN("Please use the following attn mask shapes: ", + "2d - ({Q_seq_len, 1} x {KV_seq_len, 1}); ", + "4d - ({Batch, 1} x {Num_heads, 1} x {Q_seq_len, 1} x {KV_seq_len, 1})"); + } + return false; +} + +inline bool check_tensor_shapes(sdp_params const& params, bool debug) { + auto query_dim = params.query.dim(); + if (!(query_dim == params.key.dim() && query_dim == params.value.dim() && + (query_dim == 4))) { + if (debug) { + TORCH_WARN( + "All fused kernels requires query, key and value to be 4 dimensional, but got Query dim: ", + query_dim, + ", Key dim: ", + params.key.dim(), + ", Value dim: ", + params.value.dim(), + " instead."); + } + return false; + } + return true; +} + +inline 
bool check_safe_kv_broadcast(at::Tensor const& param, bool debug) { + const auto nt_tensor_impl = at::native::get_nested_tensor_impl(param); + auto seq_len = nt_tensor_impl->opt_size(2); + if (!seq_len.has_value()) { + if (debug) { + TORCH_WARN( + "For both fused kernels, if one of key/value batch_size requires " + "broadcasting and the other does not, then the other must have a ", + "consistent seq_len dim.") + } + return false; + } + return true; +} + +inline bool check_grouped_query_attention(sdp_params const& params, bool debug) { + const auto q_num_heads = params.query.sym_size(-3); + const auto k_num_heads = params.key.sym_size(-3); + const auto v_num_heads = params.value.sym_size(-3); + const bool same_kv_heads = k_num_heads == v_num_heads; + + if (!(same_kv_heads)){ + if (debug) { + TORCH_WARN( + "Both fused kernels require key and value to have the same num_heads and batch_size but got: ", + "Key sizes: ", + params.key.sizes(), + ", Value sizes: ", + params.value.sizes(), + ", Query sizes: ", + params.query.sizes(), + " instead."); + } + return false; + } + // Check if grouped query attention is supported and validate the number of + // heads + if (q_num_heads % k_num_heads != 0) { + if (debug) { + TORCH_WARN( + "FlashAttentionV2 only supports grouped query attention, where the number of heads in key/value must divide number of heads in query.", + "Got input Key sizes(): ", + params.key.sym_size(-3), + ", Value sizes(): ", + params.value.sym_size(-3), + ", Query sizes(): ", + params.query.sym_size(-3), + " instead."); + } + return false; + } + return true; +} + +template +inline bool check_batch_size_and_num_heads_dense(sdp_params const& params, bool debug) { + // This is expected to be called after check_tensor_shapes ensuring that the + // size() calls won't error since the inputs are all 4 dimensional + + auto q_batch_size = params.query.sym_size(0); + auto k_batch_size = params.key.sym_size(0); + auto v_batch_size = params.value.sym_size(0); + + bool 
same_batch_size = + q_batch_size == k_batch_size && q_batch_size == v_batch_size; + + auto q_num_heads = params.query.sym_size(-3); + auto k_num_heads = params.key.sym_size(-3); + auto v_num_heads = params.value.sym_size(-3); + + bool same_num_heads = + q_num_heads == k_num_heads && q_num_heads == v_num_heads; + + if (!same_batch_size){ + if(debug) { + TORCH_WARN( + "For dense inputs, both fused kernels require query, key and value to have the same batch_size. ", + "Query.sizes(): ", + params.query.sizes(), + ", Key.sizes(): ", + params.key.sizes(), + ", Value.sizes(): ", + params.value.sizes(), + " instead. To broadcast dense inputs, try using unsqueeze and expand_to before passing them into the kernel."); + } + return false; + } + + if(params.enable_gqa && supports_gqa){ + return check_grouped_query_attention(params, debug); + } + + if (!same_num_heads){ + if (debug) { + TORCH_WARN( + "For dense input, both fused kernels require query, key and value to have the same num_heads. ", + "Query.sizes(): ", + params.query.sizes(), + ", Key sizes(): ", + params.key.sizes(), + ", Value sizes(): ", + params.value.sizes(), + " instead. 
To broadcast dense inputs, try using unsqueeze and expand_to before passing them into the kernel."); + } + return false; + } + // If all checks pass, return true + return true; +} + +inline bool check_batch_size_nested(sdp_params const& params, bool debug) { + // This is expected to be called after check_tensor_shapes ensuring that the + // size() calls won't error since the inputs are all 4 dimensional + auto q_batch_size = params.query.sym_size(0); + auto k_batch_size = params.key.sym_size(0); + auto v_batch_size = params.value.sym_size(0); + + bool same_batch_size = + q_batch_size == k_batch_size && q_batch_size == v_batch_size; + + // num_heads logic for nested input is checked in + // check_for_seq_len_0_nested_tensor as there is handling there to make sure + // num_heads is not ragged + bool broadcastable_batch_size = true; + if (!same_batch_size) { + if (input_requires_grad(params)){ + if (debug) { + TORCH_WARN( + "Both fused kernels do not support training with broadcasted NT inputs."); + } + return false; + } + // try to broadcast batchsize + broadcastable_batch_size = try_broadcast_param_size( + q_batch_size, k_batch_size, v_batch_size, "batch size ", debug); + + // if only one of k or v require broadcasting of batch size, the other + // must have a consistent seq_len dim + if (broadcastable_batch_size) { + if (k_batch_size == 1 && v_batch_size != 1 && + !check_safe_kv_broadcast(params.value, debug)) { + return false; + } + if (v_batch_size == 1 && k_batch_size != 1 && + !check_safe_kv_broadcast(params.key, debug)) { + return false; + } + } + } + return broadcastable_batch_size; +} + +inline bool check_nonzero_sequence_lengths_dense(sdp_params const& params, bool debug) { + // In some cases people will pass in 0 sized tensors, this will + // cause the fused path to error with unaligned mask + bool zero_seq_len_q = params.query.sym_size(-2) == 0; + bool zero_seq_len_k = params.key.sym_size(-2) == 0; + if (zero_seq_len_q || zero_seq_len_k) { + if (debug) { 
+ TORCH_WARN( + "All fused kernels do not support zero seq_len_q or seq_len_kv."); + } + return false; + } + return true; +} + +template +inline bool check_last_dim_stride_equals_1_dense(sdp_params const& params, bool debug) { + // The stride checking for NestedTensors is done within the kernel + // And .contiguous will be called if needed + + // This function checks that the last dimension of the inputs to + // fused_attention have stride 1 + bool qkv_strides_equal_1 = params.query.sym_stride(-1) == 1 && + params.key.sym_stride(-1) == 1 && params.value.sym_stride(-1) == 1; + + // https://github.com/pytorch/pytorch/issues/116333 + // If the head_dim is size 1 the stride won't matter, but we + // check this condition before padding the head_dim to 1 + if (ignore_singleton_dim){ + qkv_strides_equal_1 = qkv_strides_equal_1 || params.query.sym_size(-1) == 1; + } + bool mask_stride_equal_1 = params.attn_mask.has_value() + ? params.attn_mask.value().sym_stride(-1) == 1 + : true; + if (!(qkv_strides_equal_1 && mask_stride_equal_1)) { + if (debug) { + std::ostringstream epilogue_message; + if (params.attn_mask.has_value()) { + epilogue_message << ", Attn_mask.stride(-1): " + << params.attn_mask.value().sym_stride(-1); + } + epilogue_message << " instead."; + TORCH_WARN( + "All fused kernels require the last dimension of the input to have stride 1. 
", + "Got Query.stride(-1): ", + params.query.sym_stride(-1), + ", Key.stride(-1): ", + params.key.sym_stride(-1), + ", Value.stride(-1): ", + params.value.sym_stride(-1), + epilogue_message.str()); + } + + return false; + } + return true; +} + +inline bool check_runtime_disabled_flash(sdp_params const& params, bool debug) { + // We check the global context to see if user has explicitly turned of flash + // sdp kernels + if (!at::globalContext().userEnabledFlashSDP()) { + if (debug) { + TORCH_WARN("Flash attention has been runtime disabled."); + } + return false; + } + return true; +} + +inline bool check_runtime_disabled_mem_efficient(sdp_params const& params, bool debug) { + // We check the global context to see if user has explicitly turned of + // mem_efficient sdp kernels + if (!at::globalContext().userEnabledMemEfficientSDP()) { + if (debug) { + TORCH_WARN("Memory Efficient attention has been runtime disabled."); + } + return false; + } + return true; +} + + +} // namespace sdp diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/Factory.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/Factory.h new file mode 100644 index 0000000000000000000000000000000000000000..b0302417cdce06523958c9e7c66c08b54aa18410 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/Factory.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace at { +namespace native { +namespace mobile { + +Tensor allocate_padded_contiguous_if_needed( + const Tensor& input, + c10::MemoryFormat memory_format); + +// TODO: Remove this function when at::native::empty() is modified to accept a +// custom memory allocator. 
+ +at::Tensor empty_with_tail_padding( + IntArrayRef size, + const caffe2::TypeMeta dtype, + c10::MemoryFormat memory_format, + std::optional maybe_names); + +} // namespace mobile +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamUtils.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..adb5f1cfa49f9726db5a9304b2546b1ceff52eb3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamUtils.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include + +namespace at { +namespace native { + +template +inline std::vector _expand_param_if_needed( + ArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + if (list_param.size() == 1) { + return std::vector(expected_dim, list_param[0]); + } else if ((int64_t)list_param.size() != expected_dim) { + std::ostringstream ss; + ss << "expected " << param_name << " to be a single integer value or a " + << "list of " << expected_dim << " values to match the convolution " + << "dimensions, but got " << param_name << "=" << list_param; + AT_ERROR(ss.str()); + } else { + return list_param.vec(); + } +} + +inline std::vector expand_param_if_needed( + IntArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + return _expand_param_if_needed(list_param, param_name, expected_dim); +} + +inline std::vector expand_param_if_needed( + SymIntArrayRef list_param, + const char* param_name, + int64_t expected_dim) { + return _expand_param_if_needed(list_param, param_name, expected_dim); +} + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamsHash.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamsHash.h new file mode 100644 index 
0000000000000000000000000000000000000000..6b7894cb8549f59d34b9b52d660780b729ada575 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/native/utils/ParamsHash.h @@ -0,0 +1,104 @@ +#pragma once + +#include +#include +#include + +namespace at::native { + +// Hashing machinery for Params +// Fowler–Noll–Vo hash function +// see +// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function +template +struct ParamsHash { + // Params must be a POD because we read out its memory + // contents as char* when hashing + static_assert(std::is_standard_layout_v, "Params is not POD"); + + size_t operator()(const Params& params) const { + auto ptr = reinterpret_cast(¶ms); + uint32_t value = 0x811C9DC5; + for (const auto i : c10::irange(sizeof(Params))) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +template +struct ParamsEqual { + // Params must be a POD because we read out its memory + // contents as char* when comparing + static_assert(std::is_standard_layout_v, "Params is not POD"); + + bool operator()(const Params& a, const Params& b) const { + auto ptr1 = reinterpret_cast(&a); + auto ptr2 = reinterpret_cast(&b); + return memcmp(ptr1, ptr2, sizeof(Params)) == 0; + } +}; + +// Provide explicit byte-for-byte constructors to avoid uwittingly leaving +// padding bytes unitialized (e.g., when passing Params by value) +template +struct ParamsWrapper { + T pod; + static_assert( + std::is_standard_layout_v, + "ParamsWrapper cannot wrap non-POD data"); + + ParamsWrapper() { + memset(&(this->pod), 0, sizeof(this->pod)); + } + + ParamsWrapper(const ParamsWrapper& other) { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + } + + ParamsWrapper(ParamsWrapper&& other) noexcept { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + } + + ParamsWrapper& operator=(const ParamsWrapper& other) { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + return *this; + } + + ParamsWrapper& 
operator=(ParamsWrapper&& other) noexcept { + memcpy(&(this->pod), &(other.pod), sizeof(this->pod)); + return *this; + } + + inline friend bool operator==( + const ParamsWrapper& lhs, + const ParamsWrapper& rhs) noexcept { + auto ptr1 = reinterpret_cast(&(lhs.pod)); + auto ptr2 = reinterpret_cast(&(rhs.pod)); + return memcmp(ptr1, ptr2, sizeof(lhs.pod)) == 0; + } +}; + +// Wrapped version: this allows the outer struct to have custom copy and move +// constructors for additional safety +template +struct ParamsWrapperHash { + // Params must be a POD because we read out its memory + // contents as char* when hashing + static_assert( + std::is_standard_layout_v, + "ParamsWrapper cannot wrap non-POD data"); + + size_t operator()(const ParamsWrapper& params_wrapper) const { + auto ptr = reinterpret_cast(&(params_wrapper.pod)); + uint32_t value = 0x811C9DC5; + for (const auto i : c10::irange(sizeof(params_wrapper.pod))) { + value ^= ptr[i]; + value *= 0x01000193; + } + return (size_t)value; + } +}; + +} // namespace at::native diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized.h new file mode 100644 index 0000000000000000000000000000000000000000..038954ed0ba84d923b0e6af82f2c64b23239084e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_empty_per_channel_affine_quantized.h @@ -0,0 +1,113 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=contiguous_format) -> Tensor +inline at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, ::std::optional memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::_empty_per_channel_affine_quantized::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +namespace symint { + template ::value>> + at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, ::std::optional memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::_empty_per_channel_affine_quantized::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); + } +} + +// aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=contiguous_format) -> Tensor +inline at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::_empty_per_channel_affine_quantized::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format); +} +namespace symint { + template ::value>> + at::Tensor _empty_per_channel_affine_quantized(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::_empty_per_channel_affine_quantized::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format); + } +} + +// aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=contiguous_format) -> Tensor +inline at::Tensor _empty_per_channel_affine_quantized_symint(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, ::std::optional memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::_empty_per_channel_affine_quantized::call(size, scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); +} +namespace symint { + template ::value>> + at::Tensor _empty_per_channel_affine_quantized(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, at::TensorOptions options={}, ::std::optional memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::_empty_per_channel_affine_quantized::call(size, scales, zero_points, axis, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt(), c10::impl::check_tensor_options_and_extract_memory_format(options, memory_format)); + } +} + +// aten::_empty_per_channel_affine_quantized(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, MemoryFormat? 
memory_format=contiguous_format) -> Tensor +inline at::Tensor _empty_per_channel_affine_quantized_symint(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::_empty_per_channel_affine_quantized::call(size, scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format); +} +namespace symint { + template ::value>> + at::Tensor _empty_per_channel_affine_quantized(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory, ::std::optional memory_format) { + return at::_ops::_empty_per_channel_affine_quantized::call(size, scales, zero_points, axis, dtype, layout, device, pin_memory, memory_format); + } +} + +// aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _empty_per_channel_affine_quantized_out(at::Tensor & out, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::_empty_per_channel_affine_quantized_out::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out); +} +namespace symint { + template ::value>> + at::Tensor & _empty_per_channel_affine_quantized_out(at::Tensor & out, at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::_empty_per_channel_affine_quantized_out::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out); + } +} + +// aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _empty_per_channel_affine_quantized_outf(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::_empty_per_channel_affine_quantized_out::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out); +} +namespace symint { + template ::value>> + at::Tensor & _empty_per_channel_affine_quantized_outf(at::IntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::_empty_per_channel_affine_quantized_out::call(c10::fromIntArrayRefSlow(size), scales, zero_points, axis, memory_format, out); + } +} + +// aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & _empty_per_channel_affine_quantized_symint_out(at::Tensor & out, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::_empty_per_channel_affine_quantized_out::call(size, scales, zero_points, axis, memory_format, out); +} +namespace symint { + template ::value>> + at::Tensor & _empty_per_channel_affine_quantized_out(at::Tensor & out, c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format=c10::MemoryFormat::Contiguous) { + return at::_ops::_empty_per_channel_affine_quantized_out::call(size, scales, zero_points, axis, memory_format, out); + } +} + +// aten::_empty_per_channel_affine_quantized.out(SymInt[] size, *, Tensor scales, Tensor zero_points, int axis, MemoryFormat? memory_format=contiguous_format, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & _empty_per_channel_affine_quantized_symint_outf(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::_empty_per_channel_affine_quantized_out::call(size, scales, zero_points, axis, memory_format, out); +} +namespace symint { + template ::value>> + at::Tensor & _empty_per_channel_affine_quantized_outf(c10::SymIntArrayRef size, const at::Tensor & scales, const at::Tensor & zero_points, int64_t axis, ::std::optional memory_format, at::Tensor & out) { + return at::_ops::_empty_per_channel_affine_quantized_out::call(size, scales, zero_points, axis, memory_format, out); + } +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcmul_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcmul_native.h new file mode 100644 index 0000000000000000000000000000000000000000..0be9636383b059f1a7ae02294033307ca0f4be9b --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_addcmul_native.h @@ -0,0 +1,35 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API ::std::vector foreach_tensor_addcmul_scalar_slow(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1); +TORCH_API void _foreach_addcmul_Scalar_out(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value, at::TensorList out); +TORCH_API void foreach_tensor_addcmul_scalar_slow_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1); +TORCH_API ::std::vector foreach_tensor_addcmul_scalar_cuda(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1); +TORCH_API void foreach_tensor_addcmul_scalar_cuda_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Scalar & value=1); +TORCH_API ::std::vector foreach_tensor_addcmul_scalarlist_slow(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef scalars); +TORCH_API void _foreach_addcmul_ScalarList_out(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef scalars, at::TensorList out); +TORCH_API void foreach_tensor_addcmul_scalarlist_slow_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef scalars); +TORCH_API ::std::vector foreach_tensor_addcmul_scalarlist_cuda(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef scalars); +TORCH_API void foreach_tensor_addcmul_scalarlist_cuda_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, at::ArrayRef scalars); +TORCH_API ::std::vector foreach_tensor_addcmul_tensor_slow(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const 
at::Tensor & scalars); +TORCH_API void _foreach_addcmul_Tensor_out(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars, at::TensorList out); +TORCH_API void foreach_tensor_addcmul_tensor_slow_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars); +TORCH_API ::std::vector foreach_tensor_addcmul_tensor_cuda(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars); +TORCH_API void foreach_tensor_addcmul_tensor_cuda_(at::TensorList self, at::TensorList tensor1, at::TensorList tensor2, const at::Tensor & scalars); +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..23021887b559d20b51da2b961b21b210c14563c6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_foreach_expm1_ops.h @@ -0,0 +1,50 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _foreach_expm1 { + using schema = ::std::vector (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1(Tensor[] self) -> Tensor[]") + static ::std::vector call(at::TensorList self); + static ::std::vector redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_expm1_ { + using schema = void (at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1_") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1_(Tensor(a!)[] self) -> ()") + static void call(at::TensorList self); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self); +}; + +struct TORCH_API _foreach_expm1_out { + using schema = void (at::TensorList, at::TensorList); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_foreach_expm1") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_foreach_expm1.out(Tensor[] self, *, Tensor(a!)[] out) -> ()") + static void call(at::TensorList self, at::TensorList out); + static void redispatch(c10::DispatchKeySet dispatchKeySet, at::TensorList self, at::TensorList out); +}; + +}} // namespace at::_ops diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h new file mode 100644 
index 0000000000000000000000000000000000000000..420e3703d34a14b61e41a6251b2e8b84feb0580d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_mask_projection_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sparse_mask_projection { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, bool); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_mask_projection") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_mask_projection(Tensor self, Tensor mask, bool accumulate_matches=False) -> Tensor") + static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches); +}; + +struct TORCH_API _sparse_mask_projection_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, bool, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_mask_projection") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_mask_projection.out(Tensor self, Tensor mask, bool accumulate_matches=False, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, bool accumulate_matches, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..57a5999e055083a5aebe599ac25fa68736d3e767 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_sparse_sum_backward_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _sparse_sum_backward { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, at::IntArrayRef); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_sum_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_sum_backward(Tensor grad, Tensor self, int[] dim) -> Tensor") + static at::Tensor call(const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim); +}; + +struct TORCH_API _sparse_sum_backward_out { + using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, at::IntArrayRef, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_sparse_sum_backward") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_sparse_sum_backward.out(Tensor grad, Tensor self, int[] dim, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & grad, const at::Tensor & self, at::IntArrayRef dim, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..7f9dcd4e1ed0ddf5157fc17f67554ef0cf9a6d69 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_test_autograd_multiple_dispatch_view_copy_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _test_autograd_multiple_dispatch_view_copy { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view_copy") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view_copy(Tensor self) -> Tensor") + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +struct TORCH_API _test_autograd_multiple_dispatch_view_copy_out { + using schema = at::Tensor & (const at::Tensor &, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_test_autograd_multiple_dispatch_view_copy") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_test_autograd_multiple_dispatch_view_copy.out(Tensor self, *, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::Tensor & out); +}; + +}} // namespace at::_ops diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c797b98f0e9d84cbe1c88fd0875f1c094b9fc521 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/_unsafe_masked_index_put_accumulate_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API _unsafe_masked_index_put_accumulate { + using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const c10::List<::std::optional> &, const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::_unsafe_masked_index_put_accumulate") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "_unsafe_masked_index_put_accumulate(Tensor self, Tensor mask, Tensor?[] indices, Tensor values) -> Tensor") + static at::Tensor call(const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Tensor & values); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & mask, const c10::List<::std::optional> & indices, const at::Tensor & values); +}; + +}} // namespace at::_ops diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alias_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alias_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..c70954c94dbbb9e2793741fefaf1919b021dd8c3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/alias_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { +namespace _ops { + + +struct TORCH_API alias { + using schema = at::Tensor (const at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::alias") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "alias(Tensor(a) self) -> Tensor(a)") + static at::Tensor call(const at::Tensor & self); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self); +}; + +}} // namespace at::_ops diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7e2987cc5ce81e8d6764dfb6fa3951b257eaeed9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/arcsinh_compositeimplicitautograd_dispatch.h @@ -0,0 +1,26 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor arcsinh(const at::Tensor & self); +TORCH_API at::Tensor & arcsinh_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & arcsinh_outf(const at::Tensor & self, at::Tensor & out); +TORCH_API at::Tensor & arcsinh_(at::Tensor & self); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h new file mode 100644 index 0000000000000000000000000000000000000000..b1b76be9646bf755eb896eb79f8efde2ed3a665c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/block_diag_native.h @@ -0,0 +1,22 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor block_diag(at::TensorList tensors); +TORCH_API at::Tensor & block_diag_out(at::TensorList tensors, at::Tensor & out); +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeexplicitautogradnonfunctional_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeexplicitautogradnonfunctional_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..e504a5eaf81ddf258dd7682f86be0f21731d5c2e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeexplicitautogradnonfunctional_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed 
in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautogradnonfunctional { + +TORCH_API at::Tensor cat(const at::ITensorListRef & tensors, int64_t dim=0); + +} // namespace compositeexplicitautogradnonfunctional +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..3427483dc742ceb26cf35ae0c68792a51bcf6a7b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cat_compositeimplicitautograd_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeimplicitautograd { + +TORCH_API at::Tensor cat(at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & cat_out(at::Tensor & out, at::TensorList tensors, at::Dimname dim); +TORCH_API at::Tensor & cat_outf(at::TensorList tensors, at::Dimname dim, at::Tensor & out); + +} // namespace compositeimplicitautograd +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..946edb16883ffccb46a7ecf3811f29ffa77664de --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/cudnn_grid_sampler_cuda_dispatch.h @@ -0,0 +1,23 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor cudnn_grid_sampler(const at::Tensor & self, const at::Tensor & grid); + +} // namespace cuda +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_embed_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_embed_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..857a490cd2a051f30549884f1b728d0212aa18e3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/diag_embed_compositeexplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor & diag_embed_out(at::Tensor & out, const at::Tensor & self, int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1); +TORCH_API at::Tensor & diag_embed_outf(const at::Tensor & self, int64_t offset, int64_t dim1, int64_t dim2, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h new file mode 100644 index 0000000000000000000000000000000000000000..3cb5f4ec837a8b97ce402b27cb4cb40b7ccc0903 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fake_quantize_per_channel_affine.h @@ -0,0 +1,30 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::fake_quantize_per_channel_affine(Tensor self, Tensor scale, Tensor zero_point, int axis, int quant_min, int quant_max) -> Tensor +inline at::Tensor fake_quantize_per_channel_affine(const at::Tensor & self, const at::Tensor & scale, const at::Tensor & zero_point, int64_t axis, int64_t quant_min, int64_t quant_max) { + return at::_ops::fake_quantize_per_channel_affine::call(self, scale, zero_point, axis, quant_min, quant_max); +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fill_meta_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fill_meta_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..97fe69bc140d0408365f51e546b2ebbc0350a031 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/fill_meta_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The 
implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace meta { + +TORCH_API at::Tensor & fill_(at::Tensor & self, const at::Scalar & value); +TORCH_API at::Tensor & fill_(at::Tensor & self, const at::Tensor & value); + +} // namespace meta +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frexp.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frexp.h new file mode 100644 index 0000000000000000000000000000000000000000..b1643f7b5fe635dc54e209a33b204c5be7dbffc1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/frexp.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent) +inline ::std::tuple frexp(const at::Tensor & self) { + return at::_ops::frexp_Tensor::call(self); +} + +// aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) exponent) +inline ::std::tuple frexp_out(at::Tensor & mantissa, at::Tensor & exponent, const at::Tensor & self) { + return at::_ops::frexp_Tensor_out::call(self, mantissa, exponent); +} +// aten::frexp.Tensor_out(Tensor self, *, Tensor(a!) mantissa, Tensor(b!) exponent) -> (Tensor(a!) mantissa, Tensor(b!) 
exponent) +inline ::std::tuple frexp_outf(const at::Tensor & self, at::Tensor & mantissa, at::Tensor & exponent) { + return at::_ops::frexp_Tensor_out::call(self, mantissa, exponent); +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward.h new file mode 100644 index 0000000000000000000000000000000000000000..3f860cdd20e4aa57aa8794ba4e9fb618f7a3dc7c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/gelu_backward.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) +inline at::Tensor & gelu_backward_out(at::Tensor & grad_input, const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") { + return at::_ops::gelu_backward_grad_input::call(grad_output, self, approximate, grad_input); +} +// aten::gelu_backward.grad_input(Tensor grad_output, Tensor self, *, str approximate='none', Tensor(a!) grad_input) -> Tensor(a!) 
+inline at::Tensor & gelu_backward_outf(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate, at::Tensor & grad_input) { + return at::_ops::gelu_backward_grad_input::call(grad_output, self, approximate, grad_input); +} + +// aten::gelu_backward(Tensor grad_output, Tensor self, *, str approximate='none') -> Tensor +inline at::Tensor gelu_backward(const at::Tensor & grad_output, const at::Tensor & self, c10::string_view approximate="none") { + return at::_ops::gelu_backward::call(grad_output, self, approximate); +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h new file mode 100644 index 0000000000000000000000000000000000000000..7c4c85ba0f0773c9cada314ffa34fd8d2822267b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/i0_meta.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeMetaFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace meta { + +struct TORCH_API structured_i0 : public TensorIteratorBase { + + + void meta(const at::Tensor & self); +}; + +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..f592d7baa8b30ee7a0acb6ed043d9c9f41532932 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/is_pinned_ops.h @@ -0,0 +1,28 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. 
+// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API is_pinned { + using schema = bool (const at::Tensor &, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::is_pinned") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "is_pinned(Tensor self, Device? device=None) -> bool") + static bool call(const at::Tensor & self, ::std::optional device); + static bool redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, ::std::optional device); +}; + +}} // namespace at::_ops diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm.h new file mode 100644 index 0000000000000000000000000000000000000000..9d29fb9ea6594a8a01660326d3c482e0e687c779 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/lstm.h @@ -0,0 +1,35 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::lstm.input(Tensor input, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool train, bool bidirectional, bool batch_first) -> (Tensor, Tensor, Tensor) +inline ::std::tuple lstm(const at::Tensor & input, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional, bool batch_first) { + return at::_ops::lstm_input::call(input, hx, params, has_biases, num_layers, dropout, train, bidirectional, batch_first); +} + +// aten::lstm.data(Tensor data, Tensor batch_sizes, Tensor[] hx, Tensor[] params, bool has_biases, int num_layers, float dropout, bool 
train, bool bidirectional) -> (Tensor, Tensor, Tensor) +inline ::std::tuple lstm(const at::Tensor & data, const at::Tensor & batch_sizes, at::TensorList hx, at::TensorList params, bool has_biases, int64_t num_layers, double dropout, bool train, bool bidirectional) { + return at::_ops::lstm_data::call(data, batch_sizes, hx, params, has_biases, num_layers, dropout, train, bidirectional); +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h new file mode 100644 index 0000000000000000000000000000000000000000..e34eb9d752867f101a92e0ce22cb6ee1026ecb75 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/masked_scatter.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::masked_scatter(Tensor self, Tensor mask, Tensor source) -> Tensor +inline at::Tensor masked_scatter(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) { + return at::_ops::masked_scatter::call(self, mask, source); +} + +// aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & masked_scatter_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source) { + return at::_ops::masked_scatter_out::call(self, mask, source, out); +} +// aten::masked_scatter.out(Tensor self, Tensor mask, Tensor source, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & masked_scatter_outf(const at::Tensor & self, const at::Tensor & mask, const at::Tensor & source, at::Tensor & out) { + return at::_ops::masked_scatter_out::call(self, mask, source, out); +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h new file mode 100644 index 0000000000000000000000000000000000000000..22df2ecabc6cf5319a069bcad8250ad1624ad254 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/multinomial_ops.h @@ -0,0 +1,39 @@ +#pragma once + +// @generated by torchgen/gen.py from Operator.h + +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { +namespace _ops { + + +struct TORCH_API multinomial_out { + using schema = at::Tensor & (const at::Tensor &, int64_t, bool, ::std::optional, at::Tensor &); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multinomial") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "out") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multinomial.out(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None, Tensor(a!) 
out) -> Tensor(a!)") + static at::Tensor & call(const at::Tensor & self, int64_t num_samples, bool replacement, ::std::optional generator, at::Tensor & out); + static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, ::std::optional generator, at::Tensor & out); +}; + +struct TORCH_API multinomial { + using schema = at::Tensor (const at::Tensor &, int64_t, bool, ::std::optional); + using ptr_schema = schema*; + // See Note [static constexpr char* members for windows NVCC] + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(name, "aten::multinomial") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(overload_name, "") + STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(schema_str, "multinomial(Tensor self, int num_samples, bool replacement=False, *, Generator? generator=None) -> Tensor") + static at::Tensor call(const at::Tensor & self, int64_t num_samples, bool replacement, ::std::optional generator); + static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, int64_t num_samples, bool replacement, ::std::optional generator); +}; + +}} // namespace at::_ops diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h new file mode 100644 index 0000000000000000000000000000000000000000..64b0867f188b54f1bbcc7724d12cac678dbde6fb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nested_to_padded_tensor_native.h @@ -0,0 +1,21 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor nested_to_padded_tensor(const at::Tensor & self, double padding, at::OptionalIntArrayRef output_size=::std::nullopt); +} // namespace native +} // namespace at diff --git 
a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_full.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_full.h new file mode 100644 index 0000000000000000000000000000000000000000..b87e46547b31fdea76c67de4a72e898c65f9ff3c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/new_full.h @@ -0,0 +1,97 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +namespace symint { + template ::value>> + at::Tensor new_full(const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options={}) { + return at::_ops::new_full::call(self, c10::fromIntArrayRefSlow(size), fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); + } +} + +namespace symint { + template ::value>> + at::Tensor new_full(const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::new_full::call(self, c10::fromIntArrayRefSlow(size), fill_value, dtype, layout, device, pin_memory); + } +} + +namespace symint { + template ::value>> + at::Tensor new_full(const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::TensorOptions options={}) { + return at::_ops::new_full::call(self, size, fill_value, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); + } +} + +namespace symint { + template ::value>> + at::Tensor new_full(const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return 
at::_ops::new_full::call(self, size, fill_value, dtype, layout, device, pin_memory); + } +} + +// aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & new_full_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value) { + return at::_ops::new_full_out::call(self, c10::fromIntArrayRefSlow(size), fill_value, out); +} +namespace symint { + template ::value>> + at::Tensor & new_full_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value) { + return at::_ops::new_full_out::call(self, c10::fromIntArrayRefSlow(size), fill_value, out); + } +} + +// aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & new_full_outf(const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, at::Tensor & out) { + return at::_ops::new_full_out::call(self, c10::fromIntArrayRefSlow(size), fill_value, out); +} +namespace symint { + template ::value>> + at::Tensor & new_full_outf(const at::Tensor & self, at::IntArrayRef size, const at::Scalar & fill_value, at::Tensor & out) { + return at::_ops::new_full_out::call(self, c10::fromIntArrayRefSlow(size), fill_value, out); + } +} + +// aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & new_full_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value) { + return at::_ops::new_full_out::call(self, size, fill_value, out); +} +namespace symint { + template ::value>> + at::Tensor & new_full_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value) { + return at::_ops::new_full_out::call(self, size, fill_value, out); + } +} + +// aten::new_full.out(Tensor self, SymInt[] size, Scalar fill_value, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & new_full_symint_outf(const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out) { + return at::_ops::new_full_out::call(self, size, fill_value, out); +} +namespace symint { + template ::value>> + at::Tensor & new_full_outf(const at::Tensor & self, c10::SymIntArrayRef size, const at::Scalar & fill_value, at::Tensor & out) { + return at::_ops::new_full_out::call(self, size, fill_value, out); + } +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..85925396e31178f3d1ab049698d365e6e2d29333 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/nll_loss_forward_cpu_dispatch.h @@ -0,0 +1,28 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cpu { + +TORCH_API ::std::tuple nll_loss_forward(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, int64_t ignore_index); +TORCH_API ::std::tuple nll_loss_forward_symint(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); +TORCH_API ::std::tuple nll_loss_forward_out(at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, int64_t ignore_index); +TORCH_API ::std::tuple nll_loss_forward_outf(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, int64_t ignore_index, at::Tensor & output, at::Tensor & total_weight); +TORCH_API ::std::tuple nll_loss_forward_symint_out(at::Tensor & output, at::Tensor & total_weight, const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index); +TORCH_API ::std::tuple nll_loss_forward_symint_outf(const at::Tensor & self, const at::Tensor & target, const ::std::optional & weight, int64_t reduction, c10::SymInt ignore_index, at::Tensor & output, at::Tensor & total_weight); + +} // namespace cpu +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ones_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ones_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..7d83a3147201af3e69b736a4227ac0be2d2870ae --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/ones_compositeexplicitautograd_dispatch.h @@ -0,0 +1,34 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that 
have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API at::Tensor ones(at::IntArrayRef size, ::std::optional names, at::TensorOptions options={}); +TORCH_API at::Tensor ones(at::IntArrayRef size, ::std::optional names, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor & ones_out(at::Tensor & out, at::IntArrayRef size, ::std::optional names); +TORCH_API at::Tensor & ones_outf(at::IntArrayRef size, ::std::optional names, at::Tensor & out); +TORCH_API at::Tensor ones(at::IntArrayRef size, at::TensorOptions options={}); +TORCH_API at::Tensor ones(at::IntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor ones_symint(c10::SymIntArrayRef size, at::TensorOptions options={}); +TORCH_API at::Tensor ones_symint(c10::SymIntArrayRef size, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); +TORCH_API at::Tensor & ones_out(at::Tensor & out, at::IntArrayRef size); +TORCH_API at::Tensor & ones_outf(at::IntArrayRef size, at::Tensor & out); +TORCH_API at::Tensor & ones_symint_out(at::Tensor & out, c10::SymIntArrayRef size); +TORCH_API at::Tensor & ones_symint_outf(c10::SymIntArrayRef size, at::Tensor & out); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h new file mode 100644 index 
0000000000000000000000000000000000000000..3c6bcbfc97fb839cb850562c37df29668341c78c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/remainder_cpu_dispatch.h @@ -0,0 +1,27 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cpu { + +TORCH_API at::Tensor remainder(const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & remainder_out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor & remainder_outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out); +TORCH_API at::Tensor & remainder_(at::Tensor & self, const at::Tensor & other); +TORCH_API at::Tensor remainder(const at::Scalar & self, const at::Tensor & other); + +} // namespace cpu +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h new file mode 100644 index 0000000000000000000000000000000000000000..de27b4f4722e228130ddcdf056e430e360577e77 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/replication_pad1d.h @@ -0,0 +1,91 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template ::value>> + at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); +} +namespace symint { + template ::value>> + at::Tensor & replication_pad1d_outf(const at::Tensor & self, at::IntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, c10::fromIntArrayRefSlow(padding), out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & replication_pad1d_symint_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, padding, out); +} +namespace symint { + template ::value>> + at::Tensor & replication_pad1d_out(at::Tensor & out, const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d_out::call(self, padding, out); + } +} + +// aten::replication_pad1d.out(Tensor self, SymInt[2] padding, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & replication_pad1d_symint_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, padding, out); +} +namespace symint { + template ::value>> + at::Tensor & replication_pad1d_outf(const at::Tensor & self, c10::SymIntArrayRef padding, at::Tensor & out) { + return at::_ops::replication_pad1d_out::call(self, padding, out); + } +} + +// aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); +} +namespace symint { + template ::value>> + at::Tensor replication_pad1d(const at::Tensor & self, at::IntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, c10::fromIntArrayRefSlow(padding)); + } +} + +// aten::replication_pad1d(Tensor self, SymInt[2] padding) -> Tensor +inline at::Tensor replication_pad1d_symint(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, padding); +} +namespace symint { + template ::value>> + at::Tensor replication_pad1d(const at::Tensor & self, c10::SymIntArrayRef padding) { + return at::_ops::replication_pad1d::call(self, padding); + } +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rshift.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rshift.h new file mode 100644 index 0000000000000000000000000000000000000000..5e457a2885253bb6041bdd49ae34a4b1731452a2 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/rshift.h @@ -0,0 +1,53 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::__rshift__.Scalar(Tensor self, Scalar other) -> Tensor +inline at::Tensor 
__rshift__(const at::Tensor & self, const at::Scalar & other) { + return at::_ops::__rshift___Scalar::call(self, other); +} + +// aten::__rshift__.Tensor(Tensor self, Tensor other) -> Tensor +inline at::Tensor __rshift__(const at::Tensor & self, const at::Tensor & other) { + return at::_ops::__rshift___Tensor::call(self, other); +} + +// aten::__rshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & __rshift___out(at::Tensor & out, const at::Tensor & self, const at::Scalar & other) { + return at::_ops::__rshift___Scalar_out::call(self, other, out); +} +// aten::__rshift__.Scalar_out(Tensor self, Scalar other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & __rshift___outf(const at::Tensor & self, const at::Scalar & other, at::Tensor & out) { + return at::_ops::__rshift___Scalar_out::call(self, other, out); +} + +// aten::__rshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & __rshift___out(at::Tensor & out, const at::Tensor & self, const at::Tensor & other) { + return at::_ops::__rshift___Tensor_out::call(self, other, out); +} +// aten::__rshift__.Tensor_out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
+inline at::Tensor & __rshift___outf(const at::Tensor & self, const at::Tensor & other, at::Tensor & out) { + return at::_ops::__rshift___Tensor_out::call(self, other, out); +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h new file mode 100644 index 0000000000000000000000000000000000000000..81a90e7aa3c11bbe1960038296a60d5468846192 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/select_copy_native.h @@ -0,0 +1,23 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace at { +namespace native { +TORCH_API at::Tensor & select_copy_int_out_symint(const at::Tensor & self, int64_t dim, c10::SymInt index, at::Tensor & out); +TORCH_API at::Tensor select_copy_sparse_csr(const at::Tensor & self, int64_t dim, int64_t index); +TORCH_API at::Tensor select_copy_symint(const at::Tensor & self, int64_t dim, c10::SymInt index); +} // namespace native +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..8797587a5c47da989771e0fd3dd15b2cc2337ecb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/special_erfcx_cuda_dispatch.h @@ -0,0 +1,25 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. 
+// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. +#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor special_erfcx(const at::Tensor & self); +TORCH_API at::Tensor & special_erfcx_out(at::Tensor & out, const at::Tensor & self); +TORCH_API at::Tensor & special_erfcx_outf(const at::Tensor & self, at::Tensor & out); + +} // namespace cuda +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices.h new file mode 100644 index 0000000000000000000000000000000000000000..679abf2f9cd1a31e45bf27c3cd5382ebe47408d5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices.h @@ -0,0 +1,43 @@ +#pragma once + +// @generated by torchgen/gen.py from Function.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#include + +namespace at { + + +// aten::tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor +inline at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong) { + return at::_ops::tril_indices::call(row, col, offset, c10::optTypeMetaToScalarType(options.dtype_opt()), options.layout_opt(), options.device_opt(), options.pinned_memory_opt()); +} +// aten::tril_indices(int row, int col, int offset=0, *, ScalarType? dtype=long, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor +inline at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory) { + return at::_ops::tril_indices::call(row, col, offset, dtype, layout, device, pin_memory); +} + +// aten::tril_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & tril_indices_out(at::Tensor & out, int64_t row, int64_t col, int64_t offset=0) { + return at::_ops::tril_indices_out::call(row, col, offset, out); +} +// aten::tril_indices.out(int row, int col, int offset=0, *, Tensor(a!) out) -> Tensor(a!) +inline at::Tensor & tril_indices_outf(int64_t row, int64_t col, int64_t offset, at::Tensor & out) { + return at::_ops::tril_indices_out::call(row, col, offset, out); +} + +} diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..22378108e53f1463b36bbc19037c9a63e23a396e --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/tril_indices_cuda_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace cuda { + +TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset=0, at::TensorOptions options=at::kLong); +TORCH_API at::Tensor tril_indices(int64_t row, int64_t col, int64_t offset, ::std::optional dtype, ::std::optional layout, ::std::optional device, ::std::optional pin_memory); + +} // namespace cuda +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_compositeexplicitautograd_dispatch.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_compositeexplicitautograd_dispatch.h new file mode 100644 index 0000000000000000000000000000000000000000..cfd1ca3d1de705406ebb785410716dc9ea861d9d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/unique_dim_compositeexplicitautograd_dispatch.h @@ -0,0 +1,24 @@ +#pragma once +// @generated by torchgen/gen.py from DispatchKeyFunction.h + +// NB: The implementing C++ file is RegisterDispatchKey.cpp + +// The only #includes we need are for custom classes that have defaults in the C++ API +#include +#include +#include + +// Forward declarations of any types needed in the operator signatures. +// We can't directly include these classes because it will cause circular include dependencies. +// This file is included by TensorBody.h, which defines the Tensor class. 
+#include + +namespace at { + +namespace compositeexplicitautograd { + +TORCH_API ::std::tuple unique_dim_out(at::Tensor & out0, at::Tensor & out1, at::Tensor & out2, const at::Tensor & self, int64_t dim, bool sorted=true, bool return_inverse=false, bool return_counts=false); +TORCH_API ::std::tuple unique_dim_outf(const at::Tensor & self, int64_t dim, bool sorted, bool return_inverse, bool return_counts, at::Tensor & out0, at::Tensor & out1, at::Tensor & out2); + +} // namespace compositeexplicitautograd +} // namespace at diff --git a/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h new file mode 100644 index 0000000000000000000000000000000000000000..c413a2b881a99980203f6827ddc498d2fb48ece9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torch/include/ATen/ops/upsample_linear1d_native.h @@ -0,0 +1,27 @@ +#pragma once + +// @generated by torchgen/gen.py from NativeFunction.h + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at { +namespace native { +TORCH_API at::Tensor upsample_linear1d(const at::Tensor & input, at::OptionalIntArrayRef output_size, bool align_corners, ::std::optional> scale_factors); +struct TORCH_API structured_upsample_linear1d_out_cpu : public at::meta::structured_upsample_linear1d { +void impl(const at::Tensor & self, at::ArrayRef output_size, bool align_corners, ::std::optional scales, const at::Tensor & out); +}; +struct TORCH_API structured_upsample_linear1d_out_cuda : public at::meta::structured_upsample_linear1d { +void impl(const at::Tensor & self, at::ArrayRef output_size, bool align_corners, ::std::optional scales, const at::Tensor & out); +}; +} // namespace native +} // namespace at